In [None]:
# splits real clouds & synthetic clouds datasets
# A: synthetic dataset, B: real dataset
# trainA, testA, trainB, testB output to 'cloud/dataset' dir (create the dir first) 

from pathlib import Path
from typing import Tuple, List
import logging
import os

In [None]:
def browse_folder(
        path: Path,
        A: str,
        B: str) -> Tuple[List[Path], List[Path]]:
    """
    Recursively collect paired synthetic/real ``.jpg`` paths under ``path``.

    Every ``.jpg`` file whose full path string contains ``A`` is recorded as a
    synthetic image. Its real counterpart is derived by replacing every
    occurrence of ``A`` with ``B`` in that path string. The derived real path
    is NOT checked for existence here.

    Args:
        path (Path): Directory to walk (sub-directories are visited too).
        A (str): Substring identifying synthetic files (e.g. ``'synth'``).
        B (str): Substring substituted for ``A`` to build the real path.
    Returns:
        Tuple[List[Path], List[Path]]: ``(synthetic paths, real paths)``;
        element ``i`` of both lists refers to the same patch.
    """
    logging.info(f"Browsing folder {path}")
    filenames_synth, filenames_real = [], []
    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # (and therefore any later train/test split) reproducible across runs.
    for entry in sorted(path.iterdir()):
        if entry.is_dir():
            # The recursive call returns (synth, real) in that order, so each
            # part must be appended to the list of the same kind.
            sub_synth, sub_real = browse_folder(entry, A, B)
            filenames_synth.extend(sub_synth)
            filenames_real.extend(sub_real)
        elif entry.suffix == '.jpg' and A in str(entry):
            # Match and substitution apply to the whole path string,
            # parent directories included.
            filenames_synth.append(entry)
            filenames_real.append(Path(str(entry).replace(A, B)))

    return filenames_synth, filenames_real  # to split into train and test datasets

# Root directory that contains the patch folders.
# MUST replace with full path if on windows.
path_main = Path('dataset')
# Gather the synthetic paths and their derived real counterparts.
paths_synth, paths_real = browse_folder(path=path_main, A='synth', B='real')

In [3]:
def check_existence(
        filenames: List[Path]) -> None:
    """
    Verify that every path in ``filenames`` exists on disk.
    Args:
        filenames (List[Path]): Paths to check.
    Returns:
        None
    Raises:
        AssertionError: For the first path that does not exist.
    """
    for file_path in filenames:
        if not file_path.exists():
            # Raised explicitly (rather than via ``assert``) so the check is
            # still performed when Python runs with ``-O``.
            raise AssertionError(
                f'Missing corresponding real patch of {file_path}')

# Fail early if any collected synthetic or real path is absent on disk.
for path_group in (paths_synth, paths_real):
    check_existence(path_group)

In [4]:
def create_folders(
        input_dir: Path, 
        folder_list: List[str]) -> None:
    """
    Create one sub-directory of ``input_dir`` per name in ``folder_list``.

    Directories that already exist are left untouched, and missing parent
    directories are created as needed.
    Args:
        input_dir (Path): Input directory.
        folder_list (List[str]): Folder names to create (any length).
    Returns:
        None
    """
    # Iterate over the whole list instead of fixed indices 0..3 so that
    # shorter lists do not raise IndexError and longer ones are not truncated.
    for folder_name in folder_list:
        os.makedirs(input_dir / folder_name, exist_ok=True)

# Output folders, in the order: train/test for A, then train/test for B.
folders_list = [
    'trainA',
    'testA',
    'trainB',
    'testB',
]
create_folders(input_dir=path_main, folder_list=folders_list)


In [10]:
def split_train_test(
        filenames: List[Path], 
        alpha: float = 0.9) -> Tuple[List[Path], List[Path]]:
    """
    Cut ``filenames`` into a leading training part and a trailing testing part.

    The input order is preserved; no shuffling takes place.
    Args:
        filenames (List[Path]): List of filenames.
        alpha (float): Fraction of the list assigned to training; must lie
            strictly between 0 and 1.
    Returns:
        Tuple[List[Path], List[Path]]: ``(training filenames, testing filenames)``.
    """
    assert 0 < alpha < 1
    # Index of the first testing element (truncated towards zero).
    cut = int(len(filenames) * alpha)
    return filenames[:cut], filenames[cut:]

# Both lists are cut at the same index, so synthetic/real pairs stay aligned.
paths_trainA, paths_testA = split_train_test(filenames=paths_synth)
paths_trainB, paths_testB = split_train_test(filenames=paths_real)

In [None]:
def create_symlinks(
        filenames: List[Path],
        split_dir: Path) -> None:
    """
    Create numbered symbolic links (``0.jpg``, ``1.jpg``, ...) in ``split_dir``.

    The link for ``filenames[i]`` is named ``{i}.jpg`` and points at the
    absolute location of the source file.
    Args:
        filenames (List[Path]): Source image paths, in link-numbering order.
        split_dir (Path): Existing directory that receives the links.
    Returns:
        None
    Raises:
        FileExistsError: If a destination already exists and is not a symlink.
        OSError: If the operating system refuses to create a link.
    """
    for index, source in enumerate(filenames):
        link_path = os.path.join(split_dir, f'{index}.jpg')
        # Replace a link left over from a previous run so the cell can be
        # re-executed; anything that is not a symlink is left alone and
        # os.symlink reports it as FileExistsError.
        if os.path.islink(link_path):
            os.remove(link_path)
        # A relative target would be resolved relative to the link's own
        # directory and end up dangling, so link to the absolute path.
        os.symlink(os.path.abspath(source), link_path)

# Path lists in the same order as the entries of folders_list.
split_paths = (paths_trainA, paths_testA, paths_trainB, paths_testB)
for split_name, split_files in zip(folders_list, split_paths):
    create_symlinks(split_files, path_main / split_name)

dataset\patches_real\0.jpg
dataset\patches_real\1.jpg
dataset\patches_real\10.jpg
dataset\patches_real\100.jpg
dataset\patches_real\1000.jpg
dataset\patches_real\10000.jpg
dataset\patches_real\10001.jpg
dataset\patches_real\10002.jpg
dataset\patches_real\10003.jpg
dataset\patches_real\10004.jpg
dataset\patches_real\10005.jpg
dataset\patches_real\10006.jpg
dataset\patches_real\10007.jpg
dataset\patches_real\10008.jpg
dataset\patches_real\10009.jpg
dataset\patches_real\1001.jpg
dataset\patches_real\10010.jpg
dataset\patches_real\10011.jpg
dataset\patches_real\10012.jpg
dataset\patches_real\10013.jpg
dataset\patches_real\10014.jpg
dataset\patches_real\10015.jpg
dataset\patches_real\10016.jpg
dataset\patches_real\10017.jpg
dataset\patches_real\10018.jpg
dataset\patches_real\10019.jpg
dataset\patches_real\1002.jpg
dataset\patches_real\10020.jpg
dataset\patches_real\10021.jpg
dataset\patches_real\10022.jpg
dataset\patches_real\10023.jpg
dataset\patches_real\10024.jpg
dataset\patches_real\100