In [1]:
from pathlib import Path
import shutil
from concurrent.futures import ThreadPoolExecutor

import polars as pl

from pointcept.datasets.nia_dataset_reader import (
    NiaDataPathExtractor,
    DataFrameSplitter,
    NiaDataPathProvider,
)

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Build the provider from a path reader (regex with the named groups
# type / collector / channel / filename) and a splitter that assigns
# train / valid / test with an 8:1:1 ratio and a fixed seed.
# NOTE(review): how the pattern is anchored against `dataset_dir` is decided
# inside NiaDataPathExtractor — confirm there before editing the regex.
path_provider = NiaDataPathProvider(
    reader=NiaDataPathExtractor(
        dataset_dir="/datasets/nia/old/",
        pattern="".join(
            [
                r"(?P<type>[^/]+)/",
                r"(?P<collector>[^/]+)/",
                r".*?",
                r"(?P<channel>[^/]+)/",
                r"(?P<filename>[^/]+)$",
            ]
        ),
    ),
    splitter=DataFrameSplitter(
        seed=231111,
        splits=["train", "valid", "test"],
        ratios=[8, 1, 1],
        groups=["collector", "scene", "road", "timeslot", "weather"],
    ),
)

In [3]:
# Display the provider's `nia_df` table; per the recorded output its columns
# include `channel`, `split`, `collection_path` and `annotation_path`, which
# the cells below rely on.
path_provider.nia_df

index,stem,collector,channel,sensor,scene,road,timeslot,weather,annotation_id,collection_path,annotation_path,split
u32,str,str,str,str,str,str,str,str,str,str,str,str
0,"""CK_A01_R01_day…","""calib_K""","""image_B""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib_K""","""image_F""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib_K""","""keypoint""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib_K""","""image_L""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib_K""","""image_R""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""LK_A01_R01_day…","""calib_K""","""lidar""","""LK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""TK_A01_R01_day…","""calib_K""","""thermal""","""TK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib_K""","""image_B""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib_K""","""image_F""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib_K""","""image_L""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""


In [4]:
# Channels that keep the main dataset layout.
main_channels = [
    "image_B",
    "image_F",
    "image_L",
    "image_R",
    "lidar",
    "thermal",
]
# Channels that are re-homed under the sub-labeling folder.
sub_channels = [
    "ImageCaption",
    "keypoint",
]

# Source and destination dataset roots (both keep their trailing slash because
# they are used as string prefixes).
old_dataset_dir = "/datasets/nia/old/"
new_dataset_dir = "/datasets/nia/new/"

# Split label -> directory segment inserted into the new paths.
split_path_map = dict(
    train="1.Training/",
    valid="2.Validation/",
    test="3.Test/",
)

In [5]:
# Rows for the main channels, with destination paths added.
# Each new path is: new root + split directory + old path minus the old root.
# The original path columns are then renamed to `old_*` so every row carries
# explicit old/new pairs.
main_df = (
    path_provider.nia_df
    .filter(pl.col("channel").is_in(main_channels))
    .with_columns(
        *[
            (
                new_dataset_dir
                + pl.concat_str(
                    [
                        pl.col("split").replace(split_path_map),
                        pl.col(old_column).str.strip_prefix(old_dataset_dir),
                    ]
                )
            ).alias(new_column)
            for old_column, new_column in (
                ("collection_path", "new_collection_path"),
                ("annotation_path", "new_annotation_path"),
            )
        ]
    )
    .rename(
        {
            "collection_path": "old_collection_path",
            "annotation_path": "old_annotation_path",
        }
    )
)

In [6]:
# Spot-check the first row: old/new collection path, then old/new annotation path.
(
    main_df["old_collection_path"][0],
    main_df["new_collection_path"][0],
    main_df["old_annotation_path"][0],
    main_df["new_annotation_path"][0],
)

('/datasets/nia/old/1.원천데이터/calib_K/가시광이미지/image_B/CK_A01_R01_day_clear_01008462_B.png',
 '/datasets/nia/new/1.Training/1.원천데이터/calib_K/가시광이미지/image_B/CK_A01_R01_day_clear_01008462_B.png',
 '/datasets/nia/old/2.라벨링데이터/calib_K/가시광이미지/image_B/CK_A01_R01_day_clear_01008462_B.png.json',
 '/datasets/nia/new/1.Training/2.라벨링데이터/calib_K/가시광이미지/image_B/CK_A01_R01_day_clear_01008462_B.png.json')

In [7]:
# Rows for the sub-label channels, re-homed under "6.서브라벨링/".
# Both destination columns are derived from the *annotation* path with the old
# sub-label prefix removed; they differ only in the folder inserted after the
# new root ("1.원천데이터/" for source data, "2.라벨링데이터/" for labels).
# NOTE(review): because of that, `new_collection_path` keeps the annotation
# file name (e.g. "*.png.json") — confirm this is the intended name for the
# copied source file.
sub_df = path_provider.nia_df.filter(
    pl.col("channel").is_in(sub_channels),
).with_columns(
    (
        new_dataset_dir + "6.서브라벨링/1.원천데이터/" +
        pl.concat_str([
            pl.col("split").replace(split_path_map),
            pl.col("annotation_path").str.strip_prefix(old_dataset_dir + "2.라벨링데이터/서브라벨링/"),
        ])
    ).alias("new_collection_path"),
    (
        new_dataset_dir + "6.서브라벨링/2.라벨링데이터/" +
        pl.concat_str([
            pl.col("split").replace(split_path_map),
            pl.col("annotation_path").str.strip_prefix(old_dataset_dir + "2.라벨링데이터/서브라벨링/"),
        ])
    ).alias("new_annotation_path"),
).rename({
    # Rename BOTH original columns. Rows of this frame are passed to
    # copy_collection_file, which reads row["old_collection_path"]; leaving
    # `collection_path` un-renamed makes that lookup raise KeyError.
    "collection_path": "old_collection_path",
    "annotation_path": "old_annotation_path",
})

In [8]:
# Spot-check the first sub-label row: old annotation path, then its new location.
(
    sub_df["old_annotation_path"][0],
    sub_df["new_annotation_path"][0],
)

('/datasets/nia/old/2.라벨링데이터/서브라벨링/keypoint/CK_A01_R01_day_clear_01008462_F.png.json',
 '/datasets/nia/new/6.서브라벨링/1.Training/keypoint/CK_A01_R01_day_clear_01008462_F.png.json')

In [38]:
def move_file(source, destination) -> None:
    """Move ``source`` to ``destination``, creating parent directories.

    The operation is resumable: if ``source`` no longer exists but
    ``destination`` does, the file is treated as already moved and the call
    returns without error, so a partially completed migration can be re-run.

    Args:
        source: Path (str or ``Path``) of the file to move.
        destination: Path (str or ``Path``) the file should end up at.

    Raises:
        FileNotFoundError: if ``source`` is missing and nothing exists at
            ``destination`` either.
    """
    source_path = Path(source)
    destination_path = Path(destination)

    # Already migrated by an earlier (possibly interrupted) run.
    if destination_path.exists() and not source_path.exists():
        return

    destination_path.parent.mkdir(parents=True, exist_ok=True)

    shutil.move(source_path, destination_path)


def move_collection_file(row):
    """Move the collection file of one row.

    ``row`` is a mapping providing ``old_collection_path`` and
    ``new_collection_path``; the move itself is delegated to ``move_file``.
    """
    old_path, new_path = row["old_collection_path"], row["new_collection_path"]
    move_file(old_path, new_path)


def move_annotation_file(row):
    """Move the annotation file of one row.

    ``row`` is a mapping providing ``old_annotation_path`` and
    ``new_annotation_path``; the move itself is delegated to ``move_file``.
    """
    old_path, new_path = row["old_annotation_path"], row["new_annotation_path"]
    move_file(old_path, new_path)


def copy_file(source, destination) -> None:
    """Copy ``source`` to ``destination`` (with metadata), creating parents.

    The operation is resumable: if ``source`` no longer exists but
    ``destination`` does, the copy is treated as already done (the source may
    have been moved away afterwards) and the call returns without error.

    Args:
        source: Path (str or ``Path``) of the file to copy.
        destination: Path (str or ``Path``) of the copy to create.

    Raises:
        FileNotFoundError: if ``source`` is missing and nothing exists at
            ``destination`` either.
    """
    source_path = Path(source)
    destination_path = Path(destination)

    # Already copied by an earlier run whose source has since been relocated.
    if destination_path.exists() and not source_path.exists():
        return

    destination_path.parent.mkdir(parents=True, exist_ok=True)

    # copy2 preserves file metadata in addition to contents.
    shutil.copy2(source_path, destination_path)


def copy_collection_file(row):
    """Copy the collection file of one row.

    ``row`` is a mapping providing ``old_collection_path`` and
    ``new_collection_path``; the copy itself is delegated to ``copy_file``.
    """
    old_path, new_path = row["old_collection_path"], row["new_collection_path"]
    copy_file(old_path, new_path)

In [10]:
# Run the migration one phase at a time, each phase fanned out over the pool.
#
# * `executor.map` is lazy about errors: an exception raised in a worker only
#   surfaces when its result is pulled from the returned iterator. Each phase
#   is therefore drained with `list(...)` so the first failure is raised here
#   instead of being silently dropped.
# * Draining also makes each phase finish before the next one starts, so the
#   sub-label copies are complete before any main-dataset file is moved
#   (presumably the copied sources are among the files moved later — without
#   the barrier a copy could race a move of the same file).
with ThreadPoolExecutor(max_workers=128) as executor:
    # Materialise the rows once per frame instead of once per phase.
    sub_rows = sub_df.to_dicts()
    main_rows = main_df.to_dicts()

    phases = (
        (copy_collection_file, sub_rows),
        (move_annotation_file, sub_rows),
        (move_collection_file, main_rows),
        (move_annotation_file, main_rows),
    )
    for file_task, rows in phases:
        # On failure the remaining tasks of this phase still finish when the
        # `with` block exits, but later phases are never submitted.
        list(executor.map(file_task, rows))
    