In [1]:
from pathlib import Path
import polars as pl

# Set dataset directory

In [2]:
# Root directory of the NIA dataset; every file path is resolved below it.
nia_dataset_dir = "/datasets/nia/"

# Normalise to a trailing slash: the path regex built from this value places
# the first path component immediately after it, with no separator in between.
if nia_dataset_dir[-1:] != "/":
    nia_dataset_dir = f"{nia_dataset_dir}/"

### Define a regex that names each path component (volume, scene, channel, filename)

In [3]:
# Regex fragments, in path order, that follow the dataset root.
path_components = [
    r"(?P<volume>[^/]+)/",    # first directory below the root
    r".*?",                   # lazily skips whatever sits between volume and scene
    r"(?P<scene>[^/]+)/",     # third-from-last path component
    r"(?P<channel>[^/]+)/",   # directory that directly contains the file
    r"(?P<filename>[^/]+)$",  # final component, anchored at end of string
]

# NOTE(review): the root is interpolated verbatim, so any regex metacharacters
# in `nia_dataset_dir` would be interpreted as regex syntax, not literal text.
pattern = f"{nia_dataset_dir}" + "".join(path_components)

### Get all file paths and match subdirectory names

In [4]:
# Recursively list every entry below the root whose name contains a dot,
# stored as plain strings in a Series named "path".
path_strings = [str(found) for found in Path(nia_dataset_dir).rglob("*.*")]
paths = pl.Series("path", path_strings)

# Apply the path regex to each string; the named groups come back as one
# struct, which is expanded into columns before the raw path is appended.
path_groups = paths.str.extract_groups(pattern)
matched_df = path_groups.struct.unnest().with_columns(paths)

In [5]:
# Sanity check: per-column null counts. A non-zero count in a capture-group
# column would presumably mean some path did not match `pattern` - verify.
matched_df.null_count()

volume,scene,channel,filename,path
u32,u32,u32,u32,u32
0,0,0,0,0


### Get stem and extension from filename

In [6]:
# Split each filename on "." once and reuse the expression for both columns.
# NOTE(review): "stem" is the text before the FIRST dot and "extension" the
# text after the LAST dot, so for a name with several dots the middle parts
# end up in neither column.
name_parts = pl.col("filename").str.split(".")

matched_df = matched_df.with_columns(
    name_parts.list.first().alias("stem"),
    name_parts.list.last().alias("extension"),
)

### Count stems by the number of `_`-separated parts

In [7]:
# For every distinct number of "_"-separated parts in a stem, show how many
# stems have that many parts, plus one example stem and its split.
stem_parts = pl.col("stem").str.split("_")

(
    matched_df
    .select("stem", stem_parts.alias("_splits"))
    .with_columns(pl.col("_splits").list.len().alias("_length"))
    .group_by("_length")
    .agg(
        pl.col("stem").count().alias("count"),
        pl.col("stem").first(),
        pl.col("_splits").first(),
    )
)

_length,count,stem,_splits
u32,u32,str,list[str]
4,25200,"""CF_230906_1236…","[""CF"", ""230906"", … ""113""]"
7,127463,"""CK_A05_R03_erh…","[""CK"", ""A05"", … ""F""]"
6,79276,"""TK_A05_R03_erh…","[""TK"", ""A05"", … ""01020234""]"


### Get features from stem

In [8]:
# Vocabulary of tokens searched for inside each stem.
timeslot_values = ["mrh", "day", "lunch", "afterschool", "erh", "night"]
weather_values = ["clear", "rainy", "foggy"]

# One capture group per vocabulary, e.g. "(clear|rainy|foggy)".
# NOTE(review): these alternations are unanchored, so a token is matched
# wherever it first occurs in the stem.
timeslot_regex = "(" + "|".join(timeslot_values) + ")"
weather_regex = "(" + "|".join(weather_values) + ")"

stem_col = pl.col("stem")

# Every "<uppercase letter><two digits>" token in the stem, packed into a
# struct whose fields are named code_1 and code_2.
scenario_codes = (
    stem_col.str.extract_all(r"[A-Z]\d{2}")
    .list.to_struct(fields=["code_1", "code_2"])
    .alias("scenario_codes")
)

matched_df = matched_df.with_columns(
    # First run of uppercase letters found in the stem.
    stem_col.str.extract(r"([A-Z]+)", 1).alias("sensor"),
    scenario_codes,
    stem_col.str.extract(timeslot_regex).alias("timeslot"),
    stem_col.str.extract(weather_regex).alias("weather"),
    # First run of eight consecutive digits.
    stem_col.str.extract(r"(\d{8})").alias("annotation_id"),
).unnest("scenario_codes")

In [9]:
# Per-column null counts after feature extraction: shows how many stems
# lack each of the extracted features.
matched_df.null_count()

volume,scene,channel,filename,path,stem,extension,sensor,code_1,code_2,timeslot,weather,annotation_id
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,25200,25200,25200,25200,25200


# Pair collection and annotation

In [10]:
# Show one row per distinct "channel" value to see which channels exist
# and what kind of file each one holds.
matched_df.unique("channel")

volume,scene,channel,filename,path,stem,extension,sensor,code_1,code_2,timeslot,weather,annotation_id
str,str,str,str,str,str,str,str,str,str,str,str,str
"""230927""","""230822_163601_…","""imageCaption""","""CK_B01_R02_aft…","""/datasets/nia/…","""CK_B01_R02_aft…","""json""","""CK""","""B01""","""R02""","""afterschool""","""clear""","""01000841"""
"""231018""","""230913_181405_…","""Annotation""","""LK_A06_R05_erh…","""/datasets/nia/…","""LK_A06_R05_erh…","""json""","""LK""","""A06""","""R05""","""erh""","""rainy""","""01020891"""
"""231018""","""230913_180453_…","""lidar""","""LK_A05_R03_erh…","""/datasets/nia/…","""LK_A05_R03_erh…","""pcd""","""LK""","""A05""","""R03""","""erh""","""rainy""","""01020229"""
"""231018""","""230913_180453_…","""image_L""","""CK_A05_R03_erh…","""/datasets/nia/…","""CK_A05_R03_erh…","""png""","""CK""","""A05""","""R03""","""erh""","""rainy""","""01020262"""
"""231018""","""230913_180453_…","""image_B""","""CK_A05_R03_erh…","""/datasets/nia/…","""CK_A05_R03_erh…","""png""","""CK""","""A05""","""R03""","""erh""","""rainy""","""01020223"""
"""231018""","""230913_180453_…","""thermal""","""TK_A05_R03_erh…","""/datasets/nia/…","""TK_A05_R03_erh…","""png""","""TK""","""A05""","""R03""","""erh""","""rainy""","""01020234"""
"""231018""","""230913_180453_…","""gps""","""GK_A05_R03_erh…","""/datasets/nia/…","""GK_A05_R03_erh…","""csv""","""GK""","""A05""","""R03""","""erh""","""rainy""","""01020261"""
"""231018""","""230913_180453_…","""image_R""","""CK_A05_R03_erh…","""/datasets/nia/…","""CK_A05_R03_erh…","""png""","""CK""","""A05""","""R03""","""erh""","""rainy""","""01020228"""
"""231018""","""230913_180453_…","""image_F""","""CK_A05_R03_erh…","""/datasets/nia/…","""CK_A05_R03_erh…","""png""","""CK""","""A05""","""R03""","""erh""","""rainy""","""01020256"""


### Group by stem and channel

In [11]:
# Within each (stem, channel) group, ".json" files are treated as the
# annotation side and every other extension as the collection side.
is_collection = pl.col("extension") != "json"
is_annotation = pl.col("extension") == "json"

# Columns that differ between the two files of a pair are left out of the
# shared features; the remaining columns take the group's first value.
shared_features = pl.all().exclude("filename", "path", "extension").first()

paired_df = matched_df.group_by("stem", "channel").agg(
    shared_features,
    # First matching path per side; the result is named "<prefix>path".
    pl.col("path").filter(is_collection).first().name.prefix("collection_"),
    pl.col("path").filter(is_annotation).first().name.prefix("annotation_"),
)

In [12]:
# Per-column null counts of the paired frame. Nulls in collection_path or
# annotation_path presumably mark (stem, channel) groups that lack that side
# of the pair - verify against the data.
paired_df.null_count()

stem,channel,volume,scene,sensor,code_1,code_2,timeslot,weather,annotation_id,collection_path,annotation_path
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,25200,25200,25200,25200,25200,23176,42807


### Drop nulls and get all features with complete pairs

In [13]:
# categorical_columns = ["channel", "volume", "scene", "sensor", "code_1", "code_2", "timeslot", "weather"]
# paired_df = paired_df.drop_nulls().with_columns(
#     pl.col(categorical_columns).cast(pl.Categorical).cat.set_ordering("lexical"),
# )

In [14]:
# Keep only rows in which every column is populated, i.e. complete
# collection/annotation pairs whose stem yielded all features.
#
# Sort on the full (stem, channel) key rather than on "stem" alone: the
# group_by that produced this frame gives no row-order guarantee, and a stem
# repeats once per channel, so sorting by stem only would leave tied rows in
# an order that can change from run to run. A unique sort key makes the row
# order, and anything seeded that depends on it, reproducible.
paired_df = paired_df.drop_nulls().sort("stem", "channel")
paired_df

stem,channel,volume,scene,sensor,code_1,code_2,timeslot,weather,annotation_id,collection_path,annotation_path
str,str,str,str,str,str,str,str,str,str,str,str
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_L""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_R""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_L""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_R""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021221""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021221""","""/datasets/nia/…","""/datasets/nia/…"


# Prepare dataset splits

## Random splits

### Set hyper-parameters for random splits

In [15]:
# Base seed for the per-group shuffles.
seed = 231111

# Relative size of each split; the values are normalised by their sum.
# NOTE(review): as written, train gets 2/10, valid 5/10 and test 3/10 -
# confirm this ordering is intended.
split_ratios = {"train": 2, "valid": 5, "test": 3}
splits = list(split_ratios)
ratios = list(split_ratios.values())

# Rows are grouped by these columns and split within each group.
shuffle_groups = ("volume", "channel", "code_1", "code_2", "timeslot", "weather")

In [16]:
# Assign every row of `paired_df` to a split, separately within each
# `shuffle_groups` group: split labels are laid out in `ratios` proportion
# over the group's rows and then shuffled with a per-group seed.
# Produces `split_dfs`, a list of frames with columns "index" (row position
# in the stem-sorted frame) and "split".
split_dfs = []
indexed_df = paired_df.sort("stem").with_row_count("index")

# maintain_order=True makes groups come out in the row order of `indexed_df`
# instead of an unspecified order, so the seed handed to each group no longer
# depends on iteration order. The seed is derived as `seed + group_number`
# (first group gets seed + 1) without mutating the global `seed`, which keeps
# this cell idempotent when it is re-run.
for group_number, (_, grouped_df) in enumerate(
    indexed_df.group_by(shuffle_groups, maintain_order=True),
    start=1,
):
    num_samples = len(grouped_df)

    # Convert ratios into cumulative row boundaries for this group size.
    slice_indices_df = pl.DataFrame({
        "split": splits,
        "ratio": ratios,
    }).with_columns(
        (pl.col("ratio") / pl.col("ratio").sum()).alias("proportion"),
    ).with_columns(
        (pl.col("proportion").cumsum() * num_samples).round().cast(pl.Int64).alias("end_index"),
    ).with_columns(
        pl.col("end_index").shift(1, fill_value=0).alias("start_index"),
    ).with_columns(
        (pl.col("end_index") - pl.col("start_index")).alias('length')
    )

    # "row_nr" is the position inside the group; "split" starts as a typed
    # (string) null so the fill below never has to change the column dtype.
    split_df = grouped_df.with_row_count().select(
        pl.col("index"),
        pl.col("row_nr"),
        pl.lit(None, dtype=pl.Utf8).alias("split"),
    )

    # Fill labels boundary by boundary: a row takes the first split whose
    # end_index exceeds its row_nr. `is_null()` is used instead of `== None`
    # because equality against null is not a reliable null test.
    for split, end_index in slice_indices_df.select("split", "end_index").iter_rows():
        split_df = split_df.with_columns(
            pl.when((pl.col("row_nr") < end_index) & pl.col("split").is_null())
            .then(pl.lit(split))
            .otherwise(pl.col("split"))
            .alias("split")
        )

    # Sampling all rows without replacement and with shuffle=True permutes
    # the labels, keeping the per-split counts of this group unchanged.
    random_split_series = split_df.get_column("split").sample(
        num_samples, shuffle=True, seed=seed + group_number
    )

    split_dfs.append(
        split_df.select(
            pl.col("index"),
            random_split_series.alias("split"),
        ),
    )

In [17]:
# Stack the per-group assignments and restore the global row order recorded
# in "index", leaving one split label per row of the stem-sorted frame.
all_splits_df = pl.concat(split_dfs).sort("index")
random_split_series = all_splits_df.get_column("split")

# The labels are attached by position, so this relies on sort("stem") giving
# the same row order here as it did when "index" was assigned.
paired_df.sort("stem").with_columns(random_split_series)

stem,channel,volume,scene,sensor,code_1,code_2,timeslot,weather,annotation_id,collection_path,annotation_path,split
str,str,str,str,str,str,str,str,str,str,str,str,str
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…","""train"""
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…","""valid"""
"""CK_A01_R01_erh…","""image_L""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…","""test"""
"""CK_A01_R01_erh…","""image_R""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021219""","""/datasets/nia/…","""/datasets/nia/…","""test"""
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…","""valid"""
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…","""valid"""
"""CK_A01_R01_erh…","""image_L""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…","""test"""
"""CK_A01_R01_erh…","""image_R""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021220""","""/datasets/nia/…","""/datasets/nia/…","""valid"""
"""CK_A01_R01_erh…","""image_B""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021221""","""/datasets/nia/…","""/datasets/nia/…","""train"""
"""CK_A01_R01_erh…","""image_F""","""231018""","""230913_181649_…","""CK""","""A01""","""R01""","""erh""","""rainy""","""01021221""","""/datasets/nia/…","""/datasets/nia/…","""test"""
