In [68]:
import sys
import os
from pathlib import Path
import json
import joblib
import pandas as pd

# Root Dataset

In [2]:
# Root directory of the Fundus dataset.
# Set the FUNDUS_DATASET_ROOT environment variable to point at another location;
# when it is unset the original local path is used, so existing runs are unchanged.
root_path = os.environ.get("FUNDUS_DATASET_ROOT", r"C:\Users\gabe7\Downloads\Fundus_Dataset")

**Move all video paths into one list**

In [3]:
# Separate the training video directories from the label directory.
# The label directory is picked by name instead of by its position in the
# listing, because os.listdir() returns entries in arbitrary order.
LABEL_DIR_NAME = "Labels"
label_dir = os.path.join(root_path, LABEL_DIR_NAME)
if not os.path.isdir(label_dir):
    raise FileNotFoundError(f"Label directory not found: {label_dir}")

# Sorted so the directory order is reproducible between runs; plain files
# (which cannot contain videos) and the label directory are skipped.
# normcase() makes the name comparison case-insensitive on Windows.
training_dirs = [
    os.path.join(root_path, entry)
    for entry in sorted(os.listdir(root_path))
    if os.path.normcase(entry) != os.path.normcase(LABEL_DIR_NAME)
    and os.path.isdir(os.path.join(root_path, entry))
]

In [4]:
# Flatten every video file path into a single list.
training_videos = []
for outer_dir in training_dirs:
    # The videos sit one level down: step into the first entry of the directory
    first_entry = os.listdir(outer_dir)[0]
    clips_dir = os.path.join(outer_dir, first_entry)
    # Add the full path of every file found in that inner directory
    training_videos.extend(
        os.path.join(clips_dir, file_name) for file_name in os.listdir(clips_dir)
    )

**Labels**
- Want a peak mask and a trough mask for each video in which SVP is present
- Start and end frames of pulsations, may be multiple per video
- Images of peak and trough frames

In [5]:
# JSON label files.
# Every path is derived from root_path so the dataset location is configured in
# one place only; on the original machine the resulting strings are unchanged.
labels_root = os.path.join(root_path, "Labels")
video_information_dir = os.path.join(labels_root, "video_information")
temporal_dir = os.path.join(labels_root, "Temporal")
spatial_metadata_dir = os.path.join(labels_root, "Spatial", "metadata")

# Per-video disease information
disease_path = os.path.join(video_information_dir, "disease.json")
disease_variety_path = os.path.join(video_information_dir, "disease_variety.json")
# Temporal labels: pulsation intervals and peak/trough metadata
temporal_localization_path = os.path.join(temporal_dir, "Temporal_Localization", "Temporal_localization.json")
peak_trough_metadata_path = os.path.join(temporal_dir, "Peak_and_trough", "Metadata.json")
# Predefined train / validation / test splits
training_path = os.path.join(spatial_metadata_dir, "training.json")
validation_path = os.path.join(spatial_metadata_dir, "validation.json")
testing_path = os.path.join(spatial_metadata_dir, "testing.json")

In [6]:
# Temporal localization: read the JSON text and parse it into a dict
temporal_localization = json.loads(Path(temporal_localization_path).read_text())
temporal_localization

{'0002.avi': {'Num_frames': 334,
  'temporal_localization': [[18, 34], [71, 334]]},
 '0005.avi': {'Num_frames': 302,
  'temporal_localization': [[0, 191], [192, 302]]},
 '0009.avi': {'Num_frames': 299,
  'temporal_localization': [[47, 74], [87, 299]]},
 '0010.avi': {'Num_frames': 240, 'temporal_localization': [[0, 240]]},
 '0011.avi': {'Num_frames': 297, 'temporal_localization': [[27, 194]]},
 '0013.avi': {'Num_frames': 129, 'temporal_localization': [[7, 111]]},
 '0014.avi': {'Num_frames': 328, 'temporal_localization': [[0, 308]]},
 '0016.avi': {'Num_frames': 254, 'temporal_localization': [[0, 254]]},
 '0019.avi': {'Num_frames': 260, 'temporal_localization': [[49, 252]]},
 '0020.avi': {'Num_frames': 312, 'temporal_localization': [[0, 293]]},
 '0022.avi': {'Num_frames': 255, 'temporal_localization': [[5, 247]]},
 '0023.avi': {'Num_frames': 238, 'temporal_localization': [[23, 235]]},
 '0025.avi': {'Num_frames': 268, 'temporal_localization': [[0, 256]]},
 '0027.avi': {'Num_frames': 309,
 

In [7]:
# Directories holding the peak/trough frame images and their masks.
# Derived from root_path so the dataset location is configured in one place;
# on the original machine the resulting strings are unchanged.
peak_trough_dir = os.path.join(root_path, "Labels", "Temporal", "Peak_and_trough")
peak_trough_image_dir = os.path.join(peak_trough_dir, "images")
peak_trough_mask_dir = os.path.join(peak_trough_dir, "masks")

In [8]:
# Parse the training split JSON and show how many entries it has
train_dict = json.loads(Path(training_path).read_text())
len(train_dict)

485

In [9]:
# Parse the validation split JSON and show how many entries it has
val_dict = json.loads(Path(validation_path).read_text())
len(val_dict)

32

In [10]:
# Parse the testing split JSON and show how many entries it has
test_dict = json.loads(Path(testing_path).read_text())
len(test_dict)

118

In [11]:
# Parse the peak/trough metadata JSON
peak_trough_metadata = json.loads(Path(peak_trough_metadata_path).read_text())

In [12]:
# Number of top-level entries in the peak/trough metadata
len(peak_trough_metadata)

335

# Train Validation Test Split

In [13]:
# IDs of the predefined splits (key views of the split dictionaries)
training_ids, validation_ids, testing_ids = (
    split.keys() for split in (train_dict, val_dict, test_dict)
)

In [14]:
# Keep the metadata entries whose ID belongs to the training split.
# A comprehension keeps the loop variables local, so the builtin `id` is not
# overwritten in the notebook's global namespace.
training_samples = [
    sample
    for sample_id, sample in peak_trough_metadata.items()
    if sample_id in training_ids
]
len(training_samples)

254

In [15]:
# Keep the metadata entries whose ID belongs to the validation split.
# A comprehension keeps the loop variables local, so the builtin `id` is not
# overwritten in the notebook's global namespace.
validation_samples = [
    sample
    for sample_id, sample in peak_trough_metadata.items()
    if sample_id in validation_ids
]
len(validation_samples)

13

In [16]:
# Keep the metadata entries whose ID belongs to the testing split.
# A comprehension keeps the loop variables local, so the builtin `id` is not
# overwritten in the notebook's global namespace.
testing_samples = [
    sample
    for sample_id, sample in peak_trough_metadata.items()
    if sample_id in testing_ids
]
len(testing_samples)

68

In [17]:
# Size of the full validation split, for comparison with the matched validation samples above
len(val_dict)

32

**Segmentation sample splits should be in format:**

{

'Video Title': Video Title ('0001.avi')

'Image': Image File Path,

'Mask': Mask File Path,

}

**Temporal Localization sample splits should be in format:**

{

'Video Title': Video Title ('0001.avi')

'Video': Video File Path,

'Start, End': [Start frame, end frame],

'Num_frames': Number of frames

}

**Base paths to join with metadata paths.**

In [44]:
# Frame and mask metadata paths are both joined onto the Labels directory.
# Derived from root_path so the dataset location is configured in one place;
# on the original machine the resulting strings are unchanged.
base_frame_path = os.path.join(root_path, "Labels")
base_mask_path = base_frame_path

**Example mask path on disk:** 
C:\Users\gabe7\Downloads\Fundus_Dataset\Labels\Temporal\Peak_and_trough\masks\0011_104_min.png

**Example metadata path:** 
RVD/Temporal/Peak_and_trough/masks/0011_104_min.png

- Need to remove 'RVD' from metadata path
- Join metadata path with base path
- Append to final dictionary for MONAI use

**Training Split**

In [59]:
# Training records: one {"Image", "Mask"} dictionary per frame/mask pair.
# For each path the first 'RVD/' occurrence is dropped, the remainder is joined
# onto the base path, and the result is normalised with os.path.normpath.
train = [
    {
        "Image": os.path.normpath(
            os.path.join(base_frame_path, rel_frame.replace("RVD/", "", 1))
        ),
        "Mask": os.path.normpath(
            os.path.join(base_mask_path, rel_mask.replace("RVD/", "", 1))
        ),
    }
    for sample in training_samples
    for rel_frame, rel_mask in sample.items()
]

**Validation Split**

In [60]:
# Validation records: one {"Image", "Mask"} dictionary per frame/mask pair.
# For each path the first 'RVD/' occurrence is dropped, the remainder is joined
# onto the base path, and the result is normalised with os.path.normpath.
validation = [
    {
        "Image": os.path.normpath(
            os.path.join(base_frame_path, rel_frame.replace("RVD/", "", 1))
        ),
        "Mask": os.path.normpath(
            os.path.join(base_mask_path, rel_mask.replace("RVD/", "", 1))
        ),
    }
    for sample in validation_samples
    for rel_frame, rel_mask in sample.items()
]

**Testing Split**

In [61]:
# Testing records: one {"Image", "Mask"} dictionary per frame/mask pair.
# For each path the first 'RVD/' occurrence is dropped, the remainder is joined
# onto the base path, and the result is normalised with os.path.normpath.
test = [
    {
        "Image": os.path.normpath(
            os.path.join(base_frame_path, rel_frame.replace("RVD/", "", 1))
        ),
        "Mask": os.path.normpath(
            os.path.join(base_mask_path, rel_mask.replace("RVD/", "", 1))
        ),
    }
    for sample in testing_samples
    for rel_frame, rel_mask in sample.items()
]

**Save split file for transfer**

In [70]:
# Persist the three splits as one (train, validation, test) tuple.
# The file name is relative, so it is written to the current working directory.
joblib.dump((train, validation, test), 'train_split.joblib')

['train_split.joblib']