In [1]:
from pathlib import Path
import polars as pl
from collections import Counter
import json
import difflib

# 데이터셋 설정

In [2]:
# Root directory of the dataset; the cleaning and exploration cells below scan beneath it.
dataset_dir = Path('/datasets/nia/')

# 데이터 클리닝
### Remove unnecessary files whose names start with "."
### 라벨링데이터 디렉토리에 "._.DS_Store" 파일과 "._폴더명" 파일이 존재함.

In [3]:
# Collect hidden entries (names starting with "."), e.g. "._.DS_Store".
# Real directories are skipped: Path.unlink() cannot remove a directory, so a
# hidden directory anywhere in the tree would abort the deletion cell below.
# Symlinks are kept because unlink() removes the link itself.
files_to_remove = [
    p
    for p in dataset_dir.rglob('.*')
    if p.is_symlink() or not p.is_dir()
]
print('삭제될 파일 수', len(files_to_remove))

삭제될 파일 수 0


In [4]:
# Delete every collected hidden file. missing_ok=True keeps this cell safe to
# re-run: a path that an earlier run already removed is skipped instead of
# raising FileNotFoundError on the stale list.
for p in files_to_remove:
    p.unlink(missing_ok=True)
print('삭제완료', len(files_to_remove))

삭제완료 0


# 파일명 기반 탐색
## 학습시 최하단 2-depth 까지만 확인
### ex) **/lidar/\*.json, **/thermal/\*.json, **/image_\*/\*.json

In [5]:
# Every annotation JSON under the dataset root, at any depth.
annotations_path = list(dataset_dir.rglob('*.json'))

# File names only, once with duplicates preserved and once de-duplicated.
annotations_filenames_list = [json_path.name for json_path in annotations_path]
annotations_filenames_set = set(annotations_filenames_list)

# Number of files whose name repeats an already-seen file name.
len(annotations_filenames_list) - len(annotations_filenames_set)

410

### 같은 파일이름을 가진 경우, 내용이 같은지 확인

In [6]:
# Occurrence count per annotation file name.
filename_counter = Counter(annotations_filenames_list)

# Names that occur more than once, in first-seen order.
duplicated_filenames = [
    name for name in filename_counter if filename_counter[name] > 1
]
len(duplicated_filenames)

410

In [7]:
def have_same_annotations(file_paths):
    """Return True when all JSON files in ``file_paths`` parse to equal content.

    Args:
        file_paths: Iterable of paths (``str`` or ``Path``) to JSON files.

    Returns:
        True if every parsed document equals the first one. An empty input
        also returns True because there is nothing to disagree.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    anns = []
    for p in map(Path, file_paths):
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's locale encoding.
        with open(p, 'r', encoding='utf-8') as fp:
            anns.append(json.load(fp))

    # Every file is loaded above before comparing, so an unreadable file
    # always raises; the generator only avoids building a throwaway list.
    return all(ann == anns[0] for ann in anns)

In [8]:
# Group the already-collected annotation paths by file name. This replaces one
# full directory walk per duplicated name with a single pass, and avoids
# treating a file name as a glob pattern.
paths_by_filename = {}
for annotation_file in annotations_path:
    paths_by_filename.setdefault(annotation_file.name, []).append(annotation_file)

# Each group is sorted so the order of paths inside a group is deterministic
# rather than dependent on filesystem enumeration order.
duplicated_filename_paths = [
    sorted(paths_by_filename[filename]) for filename in duplicated_filenames
]

# True when, for every duplicated name, all files with that name hold equal JSON.
all([have_same_annotations(file_paths) for file_paths in duplicated_filename_paths])

True

### Path의 어느 부분에 차이가 있는지 확인

In [9]:
# Take the first duplicated pair and show a character-level diff of its paths.
p1, p2 = duplicated_filename_paths[0]
path_diff_lines = difflib.ndiff([str(p1)], [str(p2)])
print('\n'.join(path_diff_lines))

- /datasets/nia/231018/annotations/230913_4/230913_181405_K/230913_181405_K(1)/Annotation/LK_A06_R05_erh_rainy_01020891.json
?                                                                              ^^^^^ ^^^^

+ /datasets/nia/231018/annotations/230913_4/230913_181405_K/230913_181405_K(1)/lidar/LK_A06_R05_erh_rainy_01020891.json
?                                                                              ^^^ ^



In [10]:
# For every duplicated pair, check that the second path equals the first one
# with 'Annotation' replaced by 'lidar'.
all([
    str(first_path).replace('Annotation', 'lidar') == str(second_path)
    for first_path, second_path in duplicated_filename_paths
])


True

# 데이터셋 구조 확인

### 볼륨 확인 (전달받은 데이터 날짜)

In [11]:
# Each sub-directory of the dataset root is one delivered volume. Plain files
# at the root are ignored so they cannot be mistaken for a volume.
volume_paths = sorted(p for p in dataset_dir.iterdir() if p.is_dir())
volume_names = {p.name for p in volume_paths}
# Print in sorted order so the output is the same on every run.
print(sorted(volume_names))

{'231018', '230927'}


In [12]:
# Map each volume name to its 'collections' and 'annotations' sub-directory.
collections_dirs = {
    volume_path.name: volume_path.joinpath('collections')
    for volume_path in volume_paths
}
annotations_dirs = {
    volume_path.name: volume_path.joinpath('annotations')
    for volume_path in volume_paths
}

### 확장자 확인

In [13]:
def get_all_suffixes(directory):
    """Collect the distinct file extensions found anywhere under ``directory``.

    Args:
        directory: Root directory to scan (``str`` or ``Path``).

    Returns:
        A set of suffixes including the leading dot (e.g. ``{'.json'}``).
        Files without a suffix contribute nothing.
    """
    # Only regular files are inspected: a directory whose name contains a dot
    # (e.g. "run.v2") has a non-empty Path.suffix but is not a file extension.
    return {
        p.suffix
        for p in Path(directory).rglob('*')
        if p.is_file() and p.suffix
    }

In [14]:
# Sorted list of file extensions per volume, for raw collections and for annotations.
collections_suffixes = {
    volume: sorted(get_all_suffixes(collections_dirs[volume]))
    for volume in collections_dirs
}
annotations_suffixes = {
    volume: sorted(get_all_suffixes(annotations_dirs[volume]))
    for volume in annotations_dirs
}

In [15]:
# One column per volume listing the file extensions found under 'collections'.
pl.DataFrame(collections_suffixes)

230927,231018
str,str
""".csv""",""".csv"""
""".pcd""",""".pcd"""
""".png""",""".png"""


In [16]:
# One column per volume listing the file extensions found under 'annotations'.
pl.DataFrame(annotations_suffixes)

230927,231018
str,str
""".json""",""".json"""


### 채널 확인

In [17]:
def get_all_channels(directory):
    """Collect the names of directories that directly contain data files.

    A "channel" here is the immediate parent directory of any file under
    ``directory`` whose name contains a dot.

    Args:
        directory: Root directory to scan (``str`` or ``Path``).

    Returns:
        A set of parent-directory names.
    """
    # '*.*' also matches directories whose name contains a dot; skip them so
    # that their parent directory is not reported as a channel.
    return {
        p.parent.name
        for p in Path(directory).rglob('*.*')
        if p.is_file()
    }

In [18]:
# Sorted list of channel directory names per volume, for raw collections and for annotations.
collections_channels = {
    volume: sorted(get_all_channels(collections_dirs[volume]))
    for volume in collections_dirs
}
annotations_channels = {
    volume: sorted(get_all_channels(annotations_dirs[volume]))
    for volume in annotations_dirs
}

In [19]:
# One column per volume listing the channel directories found under 'collections'.
pl.DataFrame(collections_channels)

230927,231018
str,str
"""gps""","""gps"""
"""image_B""","""image_B"""
"""image_F""","""image_F"""
"""image_L""","""image_L"""
"""image_R""","""image_R"""
"""lidar""","""lidar"""
"""thermal""","""thermal"""


In [20]:
# One column per volume listing the channel directories found under 'annotations'.
pl.DataFrame(annotations_channels)

230927,231018
str,str
"""imageCaption""","""Annotation"""
"""image_B""","""image_B"""
"""image_F""","""image_F"""
"""image_L""","""image_L"""
"""image_R""","""image_R"""
"""lidar""","""lidar"""
"""thermal""","""thermal"""


# 데이터셋 디렉토리 패턴

In [21]:
nia_dataset_dir = '/datasets/nia'

# Allowed values for each named path component captured by the pattern below.
volume_values = ['230927', '231018']
type_values = ['annotations', 'collections']
channel_values = ['lidar', 'thermal', 'image_F', 'image_L', 'image_R', 'image_B']

# Expected layout:
#   <root>/<volume>/<type>/<one or more directories>/<channel>/<filename.ext>
# The pattern is anchored at both ends so the root only matches at the start
# of the path. The filename group excludes "/" so it always captures a single
# path component and cannot swallow nested directories.
pattern = (
    rf"^{nia_dataset_dir}"
    rf"/(?P<volume>{'|'.join(volume_values)})"
    rf"/(?P<type>{'|'.join(type_values)})"
    rf"/.*?/"
    rf"(?P<channel>{'|'.join(channel_values)})"
    rf"/(?P<filename>[^/]+\.[^/]+)$"
)

In [22]:
# Every path under the dataset root whose name contains a dot, as strings.
dotted_paths = Path(nia_dataset_dir).rglob('*.*')
file_paths = pl.Series('full_path', map(str, dotted_paths))

# Apply the directory pattern and expand the named groups into columns.
extracted_groups = file_paths.str.extract_groups(pattern)
df_extracted = extracted_groups.struct.unnest()

In [23]:
# Cast the low-cardinality columns to lexically ordered categoricals and
# attach the original full path as an extra column.
categorical_columns = ['volume', 'type', 'channel']
df = df_extracted.with_columns(
    [
        pl.col(column_name).cast(pl.Categorical).cat.set_ordering('lexical')
        for column_name in categorical_columns
    ]
    + [file_paths]
)

## 채널별 이미지, 레이블 데이터 개수 확인

In [24]:
# Number of collection files per channel in volume 230927.
is_230927_collection = (pl.col('volume') == '230927') & (pl.col('type') == 'collections')
(
    df
    .filter(is_230927_collection)
    .group_by('channel')
    .count()
    .sort('channel')
)

channel,count
cat,u32
"""image_B""",11886
"""image_F""",11886
"""image_L""",11886
"""image_R""",11886
"""lidar""",11886
"""thermal""",11886


In [25]:
# Per (volume, channel): how many files are collections and how many are annotations.
# Rows where any column is null are dropped first.
per_type_counts = [
    pl.col('filename').filter(pl.col('type') == file_type).count().alias(file_type)
    for file_type in ('collections', 'annotations')
]
(
    df
    .drop_nulls()
    .group_by(['volume', 'channel'])
    .agg(per_type_counts)
    .sort('volume', 'channel')
)

volume,channel,collections,annotations
cat,cat,u32,u32
"""230927""","""image_B""",11886,11730
"""230927""","""image_F""",11886,11730
"""230927""","""image_L""",11886,11730
"""230927""","""image_R""",11886,11730
"""230927""","""lidar""",11886,11730
"""230927""","""thermal""",11886,11730
"""231018""","""image_B""",6207,5759
"""231018""","""image_F""",6141,5759
"""231018""","""image_L""",6007,5759
"""231018""","""image_R""",6007,5759


In [49]:
# The stem is the part of the file name before its first dot.
stem_expr = pl.col('filename').str.split('.').list.first().alias('stem')

# First full path of each type within a stem group (null when the type is absent).
collection_path_expr = (
    pl.col('full_path').filter(pl.col('type') == 'collections').first().alias('collection_path')
)
annotation_path_expr = (
    pl.col('full_path').filter(pl.col('type') == 'annotations').first().alias('annotation_path')
)

# One row per stem, pairing a collection file with its annotation file.
df_aggregated = (
    df
    .drop_nulls()
    .with_columns(stem_expr)
    .group_by('stem')
    .agg(
        pl.col('volume').first(),
        pl.col('channel').first(),
        collection_path_expr,
        annotation_path_expr,
    )
)

In [50]:
# A stem is "matched" when it has both a collection file and an annotation file.
has_both_paths = (
    pl.col('collection_path').is_not_null()
    & pl.col('annotation_path').is_not_null()
)

# Stems missing at least one side of the pair.
df_unmatched = df_aggregated.filter(~has_both_paths)

# Stems with both sides present.
df_matched = df_aggregated.filter(has_both_paths)

In [51]:
# Show the stems that have both a collection path and an annotation path.
df_matched

stem,volume,channel,collection_path,annotation_path
str,cat,cat,str,str
"""LK_B01_R04_aft…","""230927""","""lidar""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A14_R01_aft…","""231018""","""image_L""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A04_R04_erh…","""230927""","""image_F""","""/datasets/nia/…","""/datasets/nia/…"
"""LK_A09_R03_aft…","""231018""","""lidar""","""/datasets/nia/…","""/datasets/nia/…"
"""TK_B09_R04_erh…","""230927""","""thermal""","""/datasets/nia/…","""/datasets/nia/…"
"""LK_A04_R03_erh…","""231018""","""lidar""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_B13_R03_aft…","""230927""","""image_B""","""/datasets/nia/…","""/datasets/nia/…"
"""LK_A03_R04_erh…","""230927""","""lidar""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A03_R04_erh…","""230927""","""image_R""","""/datasets/nia/…","""/datasets/nia/…"
"""CK_A06_R04_erh…","""230927""","""image_F""","""/datasets/nia/…","""/datasets/nia/…"
