In [None]:
import os.path
import random
import traceback

from dataset_analysis import *

# Utils

In [None]:
def path_generator(dataset_root_dir, dataset_split_list):
    """
    Walk over a dataset split and yield the file locations that belong to each sequence.

    Iteration is wrapped in a tqdm progress bar labelled 'progress: '.
    :param dataset_root_dir: dataset root; paths are built below its 'annotations',
        'sequences/RGB' and 'sequences/Thermal' sub-directories
    :param dataset_split_list: sequence indices (anything int() accepts), e.g. [0,1,2,3,...]
    :return: generator of (idx, annot_path, rgb_dir, thermal_dir) tuples, idx being an int
    """
    sequences_root = os.path.join(dataset_root_dir, 'sequences')
    modality_roots = [os.path.join(sequences_root, modality) for modality in ('RGB', 'Thermal')]
    annotations_root = os.path.join(dataset_root_dir, 'annotations')

    progress = tqdm(dataset_split_list, leave=False)
    progress.set_description('progress: ')
    for raw_idx in progress:
        seq_idx = int(raw_idx)
        seq_name = str(seq_idx)
        xml_location = os.path.join(annotations_root, seq_name + '.xml')
        rgb_location, thermal_location = [os.path.join(root, seq_name) for root in modality_roots]

        yield seq_idx, xml_location, rgb_location, thermal_location

In [None]:
def annot_dict_generator(dataset_root_dir, dataset_split_list):
    """
    Yield the parsed annotation of every sequence listed in a dataset split.

    :param dataset_root_dir: dataset root, forwarded to path_generator
    :param dataset_split_list: sequence indices, forwarded to path_generator
    :return: generator of (idx, parsed annotation, annotation file path) tuples; the parsed
        annotation is whatever parse_single_annotation_file returns for that path
    """
    for entry in path_generator(dataset_root_dir, dataset_split_list):
        seq_idx, xml_location = entry[0], entry[1]  # the RGB / thermal directories are not needed here
        yield seq_idx, parse_single_annotation_file(xml_location), xml_location

In [None]:
def get_dataset_split_list(split_path):
    """
    Read a split file that stores one integer sequence index per line.

    Blank lines (for example an extra newline at the end of a hand-edited file) are skipped
    instead of making the whole read fail.
    :param split_path: path of the split text file, e.g. '<dataset_root>/all.txt'
    :return: list of int sequence indices, in file order
    :raises ValueError: if a non-blank line cannot be converted to int
    """
    with open(split_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # convert to int, ignoring lines that hold nothing but whitespace
        return [int(text) for text in stripped_lines if text]

# Overall Statistics

In [None]:
# Root directory of the dataset; the split files (all.txt, train.txt, ...) are read from here
# and path_generator builds the annotation / sequence paths below it.
dataset_root_dir = './DATASET_ROOT'

# Sequence count as reported by count_sequences (expected to come from the wildcard import).
total_video_sequences = count_sequences(dataset_root_dir)
# Running sum of the per-sequence frame counts; filled by the statistics loop in the next cell.
total_frames = 0

# Accumulated (merged) per-sequence category counts.
total_category_distribution_frame_level_dict = {}

# Maps the sequence index (as a string) to the number of frames of that sequence.
total_sequence_length_distribution = {}

# Accumulated per-sequence attribute distributions, one dict per attribute.
# NOTE: re-run this cell before re-running the statistics loop, otherwise the loop adds
# every sequence to the existing totals a second time.
total_outside_distribution_frame_level_dict = {}
total_occlusion_distribution_frame_level_dict = {}
total_altitude_distribution_frame_level_dict = {}
total_illumination_distribution_frame_level_dict = {}
# total_keep_out_distribution_frame_level_dict = {}
# total_cam_movement_distribution_frame_level_dict = {}
total_scene_distribution_frame_level_dict = {}

In [None]:
# Accumulate the statistics of every sequence listed in all.txt into the total_* variables.
# NOTE: the totals are initialised in the previous cell; re-run that cell before re-running
# this one, otherwise each sequence is counted twice.
dataset_split_list = get_dataset_split_list(os.path.join(dataset_root_dir,'all.txt'))
for idx, xml_dict, xml_path in annot_dict_generator(dataset_root_dir, dataset_split_list):
    try:
        # Compute every per-sequence statistic before touching any total, so that a failure
        # cannot leave the totals half updated.
        frames = count_frames_per_sequence(xml_dict)
        category_count_frame_level_dict = count_category_occurrences_frame_level_per_sequence(xml_dict)
        (outside_distribution_frame_level_dict,
         occlusion_distribution_frame_level_dict,
         altitude_distribution_frame_level_dict,
         illumination_distribution_frame_level_dict,
         scene_distribution_frame_level_dict) = count_attribute_occurrence_frame_level_per_sequence(xml_dict)
    except Exception:
        print("\033[1;31merror\033[0m")  # print bold "error" in red
        traceback.print_exc()
        print(f"xml file:{xml_path}")
        # Skip this sequence entirely; merging below would otherwise re-add the attribute
        # dicts left over from the previous sequence (or fail on the very first one).
        continue

    # frames number update---------------------------------------------------------------------------------
    total_frames += frames
    total_sequence_length_distribution[f'{idx}'] = frames
    # frame-level category distribution--------------------------------------------------------------------
    total_category_distribution_frame_level_dict = merge_dicts(
        total_category_distribution_frame_level_dict,
        category_count_frame_level_dict
    )
    # frame-level attribute distribution--------------------------------------------------------------------
    total_outside_distribution_frame_level_dict = merge_dicts(total_outside_distribution_frame_level_dict,
                                                              outside_distribution_frame_level_dict)
    total_occlusion_distribution_frame_level_dict = merge_dicts(total_occlusion_distribution_frame_level_dict,
                                                                occlusion_distribution_frame_level_dict)
    total_altitude_distribution_frame_level_dict = merge_dicts(total_altitude_distribution_frame_level_dict,
                                                               altitude_distribution_frame_level_dict)
    total_illumination_distribution_frame_level_dict = merge_dicts(total_illumination_distribution_frame_level_dict,
                                                                   illumination_distribution_frame_level_dict)
    total_scene_distribution_frame_level_dict = merge_dicts(total_scene_distribution_frame_level_dict,
                                                            scene_distribution_frame_level_dict)

In [None]:
total_video_sequences

In [None]:
total_frames

In [None]:
total_sequence_length_distribution

In [None]:
total_category_distribution_frame_level_dict

In [None]:
total_altitude_distribution_frame_level_dict

In [None]:
total_illumination_distribution_frame_level_dict

In [None]:
total_occlusion_distribution_frame_level_dict

In [None]:
total_outside_distribution_frame_level_dict

In [None]:
total_scene_distribution_frame_level_dict

In [None]:
count_error_files(dataset_root_dir)

In [None]:
draw_bar_chart(total_sequence_length_distribution, title="Sequence Length Distribution")

In [None]:
draw_bar_chart(dict(sorted(total_sequence_length_distribution.items(),key=lambda x:x[1])), title="Sequence Length Distribution Sorted")

In [None]:
draw_pie_chart(total_category_distribution_frame_level_dict, title="Frame-level Category Distribution")

In [None]:
draw_pie_chart(total_scene_distribution_frame_level_dict, title="Frame-level Scene Distribution")

In [None]:
draw_pie_chart(total_altitude_distribution_frame_level_dict, title="Frame-level Altitudes")

In [None]:
draw_pie_chart(total_occlusion_distribution_frame_level_dict, title="Frame-level Occlusions")

In [None]:
draw_pie_chart(total_outside_distribution_frame_level_dict, title="Frame-level Outside")

# Dataset Split

In [None]:
# get_dataset_split_list('D:\\Project_repository\\RGBT_multi_dataset\\DATASET_ROOT\\test.txt')

In [None]:
# with open('D:\\Project_repository\\RGBT_multi_dataset\\DATASET_ROOT\\all.txt','w') as f:
#     for i in range(120):
#         f.write(str(i)+'\n')

## Split

In [None]:
# One record per sequence:
# {"idx": int, "frames": int, "categories": Dict, "altitude": Dict[str, int]}
sequence_info_dict_list = []

for idx, xml_dict, xml_path in annot_dict_generator(dataset_root_dir, dataset_split_list):
    frame_total = count_frames_per_sequence(xml_dict)
    category_counts = count_category_occurrences_frame_level_per_sequence(xml_dict)
    # All frames of a sequence are booked under the single altitude stored in its annotation.
    frames_per_altitude = dict.fromkeys(('30m', '60m', '90m', '120m'), 0)
    frames_per_altitude[xml_dict['altitude']] = frame_total
    sequence_info_dict_list.append({
        "idx": idx,
        "frames": frame_total,
        "categories": category_counts,
        "altitude": frames_per_altitude,
    })

# The list is later indexed by sequence idx, so record k must describe sequence k.
for position in range(len(sequence_info_dict_list)):
    assert sequence_info_dict_list[position]['idx'] == position

In [None]:
## ----------------------------------------------------------------------------------------------------------
# Share of the total frames that each subset should receive (the three values should add up to 1).
train_prop = 0.6
val_prop = 0.1
test_prop = 0.3

# int(x + 0.5) rounds the (non-negative) target to the nearest integer, halves going up.
train_frames_tg = int(total_frames * train_prop + 0.5)  # target train frames
val_frames_tg = int(total_frames * val_prop + 0.5)  # target val frames
test_frames_tg = int(total_frames * test_prop + 0.5)  # target test frames

# altitude -> accumulated altitude count divided by the total number of frames
altitudes_proportion = {
    altitude: altitude_count / total_frames
    for altitude, altitude_count in total_altitude_distribution_frame_level_dict.items()
}

tolerance = 0.1  # relative overshoot of a target that the split search still accepts

# the target number of frames for each altitude
train_altitude_count_tg = {
    altitude: int(train_frames_tg * proportion + 0.5) for altitude, proportion in altitudes_proportion.items()
}
val_altitude_count_tg = {
    altitude: int(val_frames_tg * proportion + 0.5) for altitude, proportion in altitudes_proportion.items()
}
test_altitude_count_tg = {
    altitude: int(test_frames_tg * proportion + 0.5) for altitude, proportion in altitudes_proportion.items()
}

# the target number of counts for each category
total_category_count = sum(total_category_distribution_frame_level_dict.values())
# category -> share of that category among all counted category occurrences
category_proportion = {
    category: category_count / total_category_count
    for category, category_count in total_category_distribution_frame_level_dict.items()
}

# Each subset should receive its share (train/val/test_prop) of every category's total count.
train_category_count_tg = {
    category: int(train_prop * total_category_distribution_frame_level_dict[category] + 0.5)
    for category in category_proportion
}
val_category_count_tg = {
    category: int(val_prop * total_category_distribution_frame_level_dict[category] + 0.5)
    for category in category_proportion
}
test_category_count_tg = {
    category: int(test_prop * total_category_distribution_frame_level_dict[category] + 0.5)
    for category in category_proportion
}


In [None]:
from queue import PriorityQueue

# Sequence lengths ordered from the longest to the shortest sequence.
sorted_sequences_dict = dict(sorted(total_sequence_length_distribution.items(), key=lambda x: x[1], reverse=True))

# The keys of total_sequence_length_distribution are the f'{idx}' strings written by the
# statistics loop; turn them back into ints because they are used to index
# sequence_info_dict_list, which is a list.
sorted_sequences_idx_list = [int(key) for key in sorted_sequences_dict]  # iterate from longest to shortest
frames_len_list = list(sorted_sequences_dict.values())

In [None]:
# node structure: (split_list, frames, category_counts, altitude_counts)
#   split_list      : ['train'|'val'|'test', ...], one entry per sequence assigned so far
#   frames          : (train frames, val frames, test frames)
#   category_counts : (train_category_count_dict, val_category_count_dict, test_category_count_dict)
#   altitude_counts : (train_altitude_count_dict, val_altitude_count_dict, test_altitude_count_dict)

def _relative_deviation(count, target, tolerance):
    """
    Measure how far `count` is from `target`; return inf when the target is overshot too much.

    When both arguments are dicts they are compared key by key (a key missing on either side
    counts as 0) and the mean of the per-key deviations is returned, so that a dict-valued
    metric weighs as much as a scalar one. Otherwise the two values are compared directly.
    A zero target that is not overshot contributes a deviation of 0 instead of dividing by zero.
    """
    if isinstance(count, dict) and isinstance(target, dict):
        keys = list(target) + [key for key in count if key not in target]
        pairs = [(count.get(key, 0), target.get(key, 0)) for key in keys]
    else:
        pairs = [(count, target)]

    if not pairs:
        return 0.0

    deviation_sum = 0.0
    for value, goal in pairs:
        if value > goal * (1 + tolerance):
            return float('inf')
        if goal:
            deviation_sum += abs(value / goal - 1)
    return deviation_sum / len(pairs)


def heuristic(node, tolerance, train_tg, val_tg, test_tg):
    """
    the heuristic function for the split search, the lower, the better

    :param node: search node, see the node structure comment above; node[1], node[2] and
        node[3] each hold one value per split in the order (train, val, test)
    :param tolerance: allowed relative overshoot; a count above target * (1 + tolerance) is infeasible
    :param train_tg: (frames target, category count target, altitude count target) of the train split
    :param val_tg: same triple for the val split
    :param test_tg: same triple for the test split
    :return: float('inf') if any frame / category / altitude count overshoots its target by more
        than the tolerance, otherwise the sum of the relative deviations |count / target - 1| over
        the three splits and the three metrics. Counts and targets may be plain numbers or dicts
        of numbers; dicts are compared key by key.
    """
    train_frames_tg, train_category_count_tg, train_altitude_count_tg = train_tg
    val_frames_tg, val_category_count_tg, val_altitude_count_tg = val_tg
    test_frames_tg, test_category_count_tg, test_altitude_count_tg = test_tg

    node_train_frames, node_val_frames, node_test_frames = node[1]
    node_train_category_count, node_val_category_count, node_test_category_count = node[2]
    node_train_altitude_count, node_val_altitude_count, node_test_altitude_count = node[3]

    # One row per metric, each row pairing the node's counts with the targets (train, val, test).
    metrics = (
        ((node_train_frames, train_frames_tg),
         (node_val_frames, val_frames_tg),
         (node_test_frames, test_frames_tg)),
        ((node_train_category_count, train_category_count_tg),
         (node_val_category_count, val_category_count_tg),
         (node_test_category_count, test_category_count_tg)),
        ((node_train_altitude_count, train_altitude_count_tg),
         (node_val_altitude_count, val_altitude_count_tg),
         (node_test_altitude_count, test_altitude_count_tg)),
    )

    metric_scores = []
    for metric in metrics:
        metric_score = 0.0
        for count, target in metric:
            deviation = _relative_deviation(count, target, tolerance)
            if deviation == float('inf'):
                # too many frames / category counts / altitude counts in one split
                return float('inf')
            metric_score += deviation
        metric_scores.append(metric_score)

    frames_prop_score, category_prop_score, altitude_prop_score = metric_scores
    return frames_prop_score + category_prop_score + altitude_prop_score


# Best-first search for a train / val / test assignment of all sequences.
# Sequences are assigned one at a time, longest first; every node is scored with heuristic()
# and the node with the lowest score is expanded next.

def _add_counts(base, extra):
    """Return a new dict holding the key-wise sum of two count dicts (inputs are not modified)."""
    summed = dict(base)
    for key, value in extra.items():
        summed[key] = summed.get(key, 0) + value
    return summed


def _assign_sequence(node, split_name, sequence_info):
    """Return the child of `node` in which the sequence described by `sequence_info` joins `split_name`."""
    slot = ('train', 'val', 'test').index(split_name)  # position of the split inside the node tuples
    frames = list(node[1])
    category_counts = list(node[2])
    altitude_counts = list(node[3])
    frames[slot] += sequence_info['frames']
    category_counts[slot] = _add_counts(category_counts[slot], sequence_info['categories'])
    altitude_counts[slot] = _add_counts(altitude_counts[slot], sequence_info['altitude'])
    return (node[0] + [split_name], tuple(frames), tuple(category_counts), tuple(altitude_counts))


train_tg = (train_frames_tg, train_category_count_tg, train_altitude_count_tg)
val_tg = (val_frames_tg, val_category_count_tg, val_altitude_count_tg)
test_tg = (test_frames_tg, test_category_count_tg, test_altitude_count_tg)

total_sequences = len(sorted_sequences_idx_list)

# The root has nothing assigned yet, so every sequence (including the longest one) may end
# up in any split. The zero dicts are shared between the splits on purpose: nodes are never
# modified in place, _add_counts always builds a new dict.
zero_category_count = dict.fromkeys(total_category_distribution_frame_level_dict, 0)
zero_altitude_count = dict.fromkeys(total_altitude_distribution_frame_level_dict, 0)
root_node = ([], (0, 0, 0),  # frames count
             (zero_category_count, zero_category_count, zero_category_count),
             (zero_altitude_count, zero_altitude_count, zero_altitude_count))

# Queue entries are (score, push order, node); the push order breaks score ties so that the
# nodes themselves (which contain dicts) never have to be compared.
priority_queue = PriorityQueue()
push_order = 0
priority_queue.put((0.0, push_order, root_node))

split_result_node = None  # first node popped from the queue with every sequence assigned
deepest_assigned = -1  # only used to keep the progress output short

while not priority_queue.empty():
    current_node = priority_queue.get()[-1]
    assigned = len(current_node[0])

    # if every sequence is assigned, the search is finished
    if assigned == total_sequences:
        split_result_node = current_node
        break

    if assigned > deepest_assigned:
        deepest_assigned = assigned
        print(f'{assigned} / {total_sequences}')

    # int() keeps this working when the idx list still holds the string keys of
    # total_sequence_length_distribution
    next_sequence_info = sequence_info_dict_list[int(sorted_sequences_idx_list[assigned])]

    # iter order: val, test, train
    for split_name in ('val', 'test', 'train'):
        next_node = _assign_sequence(current_node, split_name, next_sequence_info)
        score = heuristic(next_node, tolerance, train_tg, val_tg, test_tg)
        if score == float('inf'):
            continue  # a target is overshot beyond the tolerance; never expand this branch
        push_order += 1
        priority_queue.put((score, push_order, next_node))

# print result (the assignment is only printed here; it is not written to the split files)
if split_result_node is None:
    print('no split satisfies the targets within the tolerance; consider increasing `tolerance`')
else:
    print(split_result_node)

In [None]:
# random.seed(1)
#
# def random_dic(dicts):
#     dict_key_ls = list(dicts.keys())
#     random.shuffle(dict_key_ls)
#     new_dic = {}
#     for key in dict_key_ls:
#         new_dic[key] = dicts.get(key)
#     return new_dic
#
# # shuffled_thermal_len_dict = dict(sorted(total_sequence_length_distribution.items(),key=lambda x:x[1]))
# shuffled_thermal_len_dict = random_dic(total_sequence_length_distribution)  # shuffled, don't use sorted
# # count total
# total = 0
# for key, val in shuffled_thermal_len_dict.items():
#     total += val
#
# # proportion, train:val:test = 12:6:1
# train_num = int(total * 0.632 + 0.5)
# val_num = int(total * 0.316 + 0.5)
# test_num = total - train_num - val_num
#
# train_list = []
# val_list = []
# test_list = []
# for key, val in shuffled_thermal_len_dict.items():
#     if train_num > 0 and train_num - val >= 0:  # if training set is not full
#         train_list.append(f'{key}\n')
#         train_num -= val
#     elif val_num > 0 and val_num - val >= 0:  # val
#         val_list.append(f'{key}\n')
#         val_num -= val
#     else:  # test
#         test_list.append(f'{key}\n')
#
# train_txt_path = os.path.join(dataset_root_dir,'train.txt')
# val_txt_path = os.path.join(dataset_root_dir,'val.txt')
# test_txt_path = os.path.join(dataset_root_dir,'test.txt')
#
# with open(train_txt_path, 'w') as f:
#     f.writelines(train_list)
# with open(val_txt_path, 'w') as f:
#     f.writelines(val_list)
# with open(test_txt_path, 'w') as f:
#     f.writelines(test_list)

## Statistics

### Train

In [None]:
# Accumulators for the train split; filled by the train statistics loop in the next cell.
# NOTE: re-run this cell before re-running that loop, otherwise the loop adds to the old totals.
train_video_sequences = 0
train_frames = 0

# Accumulated (merged) per-sequence category counts.
train_category_distribution_frame_level_dict = {}

# Maps the sequence index (as a string) to the number of frames of that sequence.
train_sequence_length_distribution = {}

# Accumulated per-sequence attribute distributions, one dict per attribute.
train_outside_distribution_frame_level_dict = {}
train_occlusion_distribution_frame_level_dict = {}
train_altitude_distribution_frame_level_dict = {}
train_illumination_distribution_frame_level_dict = {}
train_scene_distribution_frame_level_dict = {}

In [None]:
# Accumulate the statistics of every sequence listed in train.txt into the train_* variables.
# NOTE: this rebinds the global dataset_split_list to the train split, and the train_* totals
# are initialised in the previous cell (re-run it before re-running this one).
dataset_split_list = get_dataset_split_list(os.path.join(dataset_root_dir,'train.txt'))
for idx, xml_dict, xml_path in annot_dict_generator(dataset_root_dir, dataset_split_list):
    # every listed sequence is counted, even if its statistics cannot be computed
    train_video_sequences += 1
    try:
        # Compute every per-sequence statistic before touching any total, so that a failure
        # cannot leave the totals half updated.
        frames = count_frames_per_sequence(xml_dict)
        category_count_frame_level_dict = count_category_occurrences_frame_level_per_sequence(xml_dict)
        (outside_distribution_frame_level_dict,
         occlusion_distribution_frame_level_dict,
         altitude_distribution_frame_level_dict,
         illumination_distribution_frame_level_dict,
         scene_distribution_frame_level_dict) = count_attribute_occurrence_frame_level_per_sequence(xml_dict)
    except Exception:
        print("\033[1;31merror\033[0m")  # print bold "error" in red
        traceback.print_exc()
        print(f"xml file:{xml_path}")
        # Skip this sequence; merging below would otherwise re-add the attribute dicts left
        # over from the previous sequence (or fail on the very first one).
        continue

    # frames number update---------------------------------------------------------------------------------
    train_frames += frames
    train_sequence_length_distribution[f'{idx}'] = frames
    # frame-level category distribution--------------------------------------------------------------------
    train_category_distribution_frame_level_dict = merge_dicts(
        train_category_distribution_frame_level_dict,
        category_count_frame_level_dict
    )
    # frame-level attribute distribution--------------------------------------------------------------------
    train_outside_distribution_frame_level_dict = merge_dicts(train_outside_distribution_frame_level_dict,
                                                              outside_distribution_frame_level_dict)
    train_occlusion_distribution_frame_level_dict = merge_dicts(train_occlusion_distribution_frame_level_dict,
                                                                occlusion_distribution_frame_level_dict)
    train_altitude_distribution_frame_level_dict = merge_dicts(train_altitude_distribution_frame_level_dict,
                                                               altitude_distribution_frame_level_dict)
    train_illumination_distribution_frame_level_dict = merge_dicts(train_illumination_distribution_frame_level_dict,
                                                                   illumination_distribution_frame_level_dict)
    train_scene_distribution_frame_level_dict = merge_dicts(train_scene_distribution_frame_level_dict,
                                                            scene_distribution_frame_level_dict)

In [None]:
train_video_sequences

In [None]:
train_frames

In [None]:
train_sequence_length_distribution

In [None]:
train_category_distribution_frame_level_dict

In [None]:
train_altitude_distribution_frame_level_dict

In [None]:
train_illumination_distribution_frame_level_dict

In [None]:
train_occlusion_distribution_frame_level_dict

In [None]:
train_outside_distribution_frame_level_dict

In [None]:
train_scene_distribution_frame_level_dict

In [None]:
draw_bar_chart(train_sequence_length_distribution, title="Sequence Length Distribution")

In [None]:
draw_pie_chart(train_category_distribution_frame_level_dict, title="Frame-level Category Distribution")

In [None]:
draw_pie_chart(train_scene_distribution_frame_level_dict, title="Frame-level Scene Distribution")

In [None]:
draw_pie_chart(train_altitude_distribution_frame_level_dict, title="Frame-level Altitudes")

In [None]:
draw_pie_chart(train_occlusion_distribution_frame_level_dict, title="Frame-level Occlusions")

In [None]:
draw_pie_chart(train_outside_distribution_frame_level_dict, title="Frame-level Outside")

### Validation

In [None]:
# Accumulators for the validation split; filled by the validation statistics loop in the next cell.
# NOTE: re-run this cell before re-running that loop, otherwise the loop adds to the old totals.
val_video_sequences = 0
val_frames = 0

# Accumulated (merged) per-sequence category counts.
val_category_distribution_frame_level_dict = {}

# Maps the sequence index (as a string) to the number of frames of that sequence.
val_sequence_length_distribution = {}

# Accumulated per-sequence attribute distributions, one dict per attribute.
val_outside_distribution_frame_level_dict = {}
val_occlusion_distribution_frame_level_dict = {}
val_altitude_distribution_frame_level_dict = {}
val_illumination_distribution_frame_level_dict = {}
val_scene_distribution_frame_level_dict = {}

In [None]:
# Accumulate the statistics of every sequence listed in val.txt into the val_* variables.
# NOTE: this rebinds the global dataset_split_list to the validation split, and the val_*
# totals are initialised in the previous cell (re-run it before re-running this one).
dataset_split_list = get_dataset_split_list(os.path.join(dataset_root_dir,'val.txt'))
for idx, xml_dict, xml_path in annot_dict_generator(dataset_root_dir, dataset_split_list):
    # every listed sequence is counted, even if its statistics cannot be computed
    val_video_sequences += 1
    try:
        # Compute every per-sequence statistic before touching any total, so that a failure
        # cannot leave the totals half updated.
        frames = count_frames_per_sequence(xml_dict)
        category_count_frame_level_dict = count_category_occurrences_frame_level_per_sequence(xml_dict)
        (outside_distribution_frame_level_dict,
         occlusion_distribution_frame_level_dict,
         altitude_distribution_frame_level_dict,
         illumination_distribution_frame_level_dict,
         scene_distribution_frame_level_dict) = count_attribute_occurrence_frame_level_per_sequence(xml_dict)
    except Exception:
        print("\033[1;31merror\033[0m")  # print bold "error" in red
        traceback.print_exc()
        print(f"xml file:{xml_path}")
        # Skip this sequence; merging below would otherwise re-add the attribute dicts left
        # over from the previous sequence (or fail on the very first one).
        continue

    # frames number update---------------------------------------------------------------------------------
    val_frames += frames
    val_sequence_length_distribution[f'{idx}'] = frames
    # frame-level category distribution--------------------------------------------------------------------
    val_category_distribution_frame_level_dict = merge_dicts(
        val_category_distribution_frame_level_dict,
        category_count_frame_level_dict
    )
    # frame-level attribute distribution--------------------------------------------------------------------
    val_outside_distribution_frame_level_dict = merge_dicts(val_outside_distribution_frame_level_dict,
                                                            outside_distribution_frame_level_dict)
    val_occlusion_distribution_frame_level_dict = merge_dicts(val_occlusion_distribution_frame_level_dict,
                                                              occlusion_distribution_frame_level_dict)
    val_altitude_distribution_frame_level_dict = merge_dicts(val_altitude_distribution_frame_level_dict,
                                                             altitude_distribution_frame_level_dict)
    val_illumination_distribution_frame_level_dict = merge_dicts(val_illumination_distribution_frame_level_dict,
                                                                 illumination_distribution_frame_level_dict)
    val_scene_distribution_frame_level_dict = merge_dicts(val_scene_distribution_frame_level_dict,
                                                          scene_distribution_frame_level_dict)

In [None]:
val_video_sequences

In [None]:
val_frames

In [None]:
val_sequence_length_distribution

In [None]:
val_category_distribution_frame_level_dict

In [None]:
val_altitude_distribution_frame_level_dict

In [None]:
val_illumination_distribution_frame_level_dict

In [None]:
val_occlusion_distribution_frame_level_dict

In [None]:
val_outside_distribution_frame_level_dict

In [None]:
val_scene_distribution_frame_level_dict

In [None]:
draw_bar_chart(val_sequence_length_distribution, title="Sequence Length Distribution")

In [None]:
draw_pie_chart(val_category_distribution_frame_level_dict, title="Frame-level Category Distribution")

In [None]:
draw_pie_chart(val_scene_distribution_frame_level_dict, title="Frame-level Scene Distribution")

In [None]:
draw_pie_chart(val_altitude_distribution_frame_level_dict, title="Frame-level Altitudes")

In [None]:
draw_pie_chart(val_occlusion_distribution_frame_level_dict, title="Frame-level Occlusions")

In [None]:
draw_pie_chart(val_outside_distribution_frame_level_dict, title="Frame-level Outside")

### Test

In [None]:
# Accumulators for the test split; filled by the test statistics loop in the next cell.
# NOTE: re-run this cell before re-running that loop, otherwise the loop adds to the old totals.
test_video_sequences = 0
test_frames = 0

# Accumulated (merged) per-sequence category counts.
test_category_distribution_frame_level_dict = {}

# Maps the sequence index (as a string) to the number of frames of that sequence.
test_sequence_length_distribution = {}

# Accumulated per-sequence attribute distributions, one dict per attribute.
test_outside_distribution_frame_level_dict = {}
test_occlusion_distribution_frame_level_dict = {}
test_altitude_distribution_frame_level_dict = {}
test_illumination_distribution_frame_level_dict = {}
test_scene_distribution_frame_level_dict = {}

In [None]:
# Accumulate the statistics of every sequence listed in test.txt into the test_* variables.
# NOTE: this rebinds the global dataset_split_list to the test split, and the test_* totals
# are initialised in the previous cell (re-run it before re-running this one).
dataset_split_list = get_dataset_split_list(os.path.join(dataset_root_dir,'test.txt'))
for idx, xml_dict, xml_path in annot_dict_generator(dataset_root_dir, dataset_split_list):
    # every listed sequence is counted, even if its statistics cannot be computed
    test_video_sequences += 1
    try:
        # Compute every per-sequence statistic before touching any total, so that a failure
        # cannot leave the totals half updated.
        frames = count_frames_per_sequence(xml_dict)
        category_count_frame_level_dict = count_category_occurrences_frame_level_per_sequence(xml_dict)
        (outside_distribution_frame_level_dict,
         occlusion_distribution_frame_level_dict,
         altitude_distribution_frame_level_dict,
         illumination_distribution_frame_level_dict,
         scene_distribution_frame_level_dict) = count_attribute_occurrence_frame_level_per_sequence(xml_dict)
    except Exception:
        print("\033[1;31merror\033[0m")  # print bold "error" in red
        traceback.print_exc()
        print(f"xml file:{xml_path}")
        # Skip this sequence; merging below would otherwise re-add the attribute dicts left
        # over from the previous sequence (or fail on the very first one).
        continue

    # frames number update---------------------------------------------------------------------------------
    test_frames += frames
    test_sequence_length_distribution[f'{idx}'] = frames
    # frame-level category distribution--------------------------------------------------------------------
    test_category_distribution_frame_level_dict = merge_dicts(
        test_category_distribution_frame_level_dict,
        category_count_frame_level_dict
    )
    # frame-level attribute distribution--------------------------------------------------------------------
    test_outside_distribution_frame_level_dict = merge_dicts(test_outside_distribution_frame_level_dict,
                                                             outside_distribution_frame_level_dict)
    test_occlusion_distribution_frame_level_dict = merge_dicts(test_occlusion_distribution_frame_level_dict,
                                                               occlusion_distribution_frame_level_dict)
    test_altitude_distribution_frame_level_dict = merge_dicts(test_altitude_distribution_frame_level_dict,
                                                              altitude_distribution_frame_level_dict)
    test_illumination_distribution_frame_level_dict = merge_dicts(test_illumination_distribution_frame_level_dict,
                                                                  illumination_distribution_frame_level_dict)
    test_scene_distribution_frame_level_dict = merge_dicts(test_scene_distribution_frame_level_dict,
                                                           scene_distribution_frame_level_dict)

In [None]:
test_video_sequences

In [None]:
test_frames

In [None]:
test_sequence_length_distribution

In [None]:
test_category_distribution_frame_level_dict

In [None]:
test_altitude_distribution_frame_level_dict

In [None]:
test_illumination_distribution_frame_level_dict

In [None]:
test_occlusion_distribution_frame_level_dict

In [None]:
test_outside_distribution_frame_level_dict

In [None]:
test_scene_distribution_frame_level_dict

In [None]:
draw_bar_chart(test_sequence_length_distribution, title="Sequence Length Distribution")

In [None]:
draw_pie_chart(test_category_distribution_frame_level_dict, title="Frame-level Category Distribution")

In [None]:
draw_pie_chart(test_scene_distribution_frame_level_dict, title="Frame-level Scene Distribution")

In [None]:
draw_pie_chart(test_altitude_distribution_frame_level_dict, title="Frame-level Altitudes")

In [None]:
draw_pie_chart(test_occlusion_distribution_frame_level_dict, title="Frame-level Occlusions")

In [None]:
draw_pie_chart(test_outside_distribution_frame_level_dict, title="Frame-level Outside")

### Overall

In [None]:
frames_ratio_dict = {'train': train_frames, 'val': val_frames, 'test': test_frames}

In [None]:
sequences_ratio_dict = {'train': train_video_sequences, 'val': val_video_sequences, 'test': test_video_sequences}

In [None]:
draw_pie_chart(frames_ratio_dict, 'Frames Proportion')

In [None]:
draw_pie_chart(sequences_ratio_dict, 'Sequences Proportion')