In [1]:
import polars as pl

from nia_dataset_reader import (
    NiaDataPathExtractor,
    DataFrameSplitter,
    NiaDataPathProvider,
)

### 데이터셋 경로 설정

In [2]:
# Location of the dataset inside the S3 bucket. Kept for reference only: the
# notebook reads from the container mount below, so this value is never passed
# to the extractor. It previously lived in `dataset_dir` and was overwritten on
# the very next line (a dead assignment).
s3_dataset_dir = "/aidata-2023-044-058/058.생활도로 객체인식 자율주행 데이터/1.구축과정산출물/2.최종검증/1.Datasets/"

# Path inside the Docker container (the dataset has to be mounted here).
dataset_dir = "/datasets/nia/"

### 데이터셋 경로 추출

In [3]:
# Build the extractor from the dataset root and a path regex. The regex is
# assembled from fragments, each describing one "/"-separated path segment.
# NOTE(review): presumably the pattern is applied to file paths under
# `dataset_dir` and the named groups become extracted fields — confirm against
# NiaDataPathExtractor.
path_extractor = NiaDataPathExtractor(
    pattern="".join(
        [
            r"(?P<type>[^/]+)/",  # slash-free segment captured as `type`
            r"(?P<collector>[^/]+)/",  # next segment captured as `collector`
            r".*?",  # lazily skip whatever lies in between
            r"(?P<channel>[^/]+)/",  # segment right before the file name
            r"(?P<filename>[^/]+)$",  # last segment, anchored at end of path
        ]
    ),
    dataset_dir=dataset_dir,
)

### 멀티모달 데이터 랜덤 스플릿

In [4]:
# Split names and their relative weights, kept side by side in one ordered
# mapping; the two lists handed to the splitter are derived from it.
# NOTE(review): presumably `ratios` pairs with `splits` by position (8:1:1 for
# train:valid:test) — confirm against DataFrameSplitter.
split_ratios = {"train": 8, "valid": 1, "test": 1}

# Fixed seed so the random split can be reproduced on a re-run.
splitter = DataFrameSplitter(
    groups=["collector", "scene", "road", "timeslot", "weather"],
    splits=list(split_ratios),
    ratios=list(split_ratios.values()),
    seed=231111,
)

In [5]:
# Run the random split on the extractor's paired DataFrame; the result is the
# frame used by every following cell.
nia_df = splitter.random_split(path_extractor.paired_df)

In [6]:
# Show the split result; as the cell's last expression it is rendered as output.
nia_df

index,stem,collector,channel,sensor,scene,road,timeslot,weather,annotation_id,collection_path,annotation_path,split
u32,str,str,str,str,str,str,str,str,str,str,str,str
0,"""CK_A01_R01_day…","""calib(K)""","""image_B""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib(K)""","""image_F""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib(K)""","""keypoint""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib(K)""","""image_L""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""CK_A01_R01_day…","""calib(K)""","""image_R""","""CK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""LK_A01_R01_day…","""calib(K)""","""lidar""","""LK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
0,"""TK_A01_R01_day…","""calib(K)""","""thermal""","""TK""","""A01""","""R01""","""day""","""clear""","""01008462""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib(K)""","""image_B""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib(K)""","""image_F""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""
1,"""CK_A01_R01_day…","""calib(K)""","""image_L""","""CK""","""A01""","""R01""","""day""","""clear""","""01008463""","""/datasets/nia/…","""/datasets/nia/…","""train"""


### 채널별, 스플릿별 데이터 수량

In [7]:
# Number of rows for every (channel, split) pair, ordered by channel and then
# by split.
# NOTE(review): newer polars releases deprecate GroupBy.count() in favour of
# GroupBy.len(), which names the output column differently — check the pinned
# polars version before switching.
nia_df.group_by(["channel", "split"]).count().sort(by=["channel", "split"])

channel,split,count
str,str,u32
"""ImageCaption""","""test""",990
"""ImageCaption""","""train""",8006
"""ImageCaption""","""valid""",1030
"""image_B""","""test""",9965
"""image_B""","""train""",80001
"""image_B""","""valid""",10034
"""image_F""","""test""",9965
"""image_F""","""train""",80001
"""image_F""","""valid""",10034
"""image_L""","""test""",9965


### csv로 추출

In [8]:
# Persist the split table as CSV; the relative path resolves against the
# kernel's current working directory.
nia_df.write_csv(file="multi_modal_dataset_info.csv")