In [1]:
from pathlib import Path
import json
import shutil
import os

In [2]:
dataset_dir = Path("/mnt/ssd2/xin/repo/DART/Liebherr_Product")
repo_dir = Path("/mnt/ssd2/xin/repo/DART/diversification/dreambooth")

generated_data_dir = repo_dir / "generated_data"

# Define the images directory and duplicates directory using Path objects
image_dir = dataset_dir / "images"
meta_dir = dataset_dir / "metadata"

label_gen_dir = dataset_dir / "labels_gen"
label_orig_dir = dataset_dir / "labels"
label_background_dir = dataset_dir / "labels_background"

label_dir = dataset_dir / "labels"
yolo_dir = dataset_dir / "yolo"

label_all_dir = dataset_dir / "labels_all"
yolo_all_dir = dataset_dir / "yolo_all"


# List all objects in the image directory
objs = sorted([obj.name for obj in image_dir.iterdir() if obj.is_dir()])

with open(meta_dir / "id_to_name.json", "r") as f:
    id_to_name = json.load(f)
with open(meta_dir / "id_to_name_gen.json", "r") as f:
    id_to_name_gen = json.load(f)
with open(meta_dir / "classes.json", "r") as f:
    class_name_to_id = json.load(f)
    class_id_to_name = {v: k for k, v in class_name_to_id.items()}
with open(meta_dir / "near_duplicates.json", "r") as f:
    near_duplicates = json.load(f)

with open(label_all_dir / "labels_nms.json", "r") as f:
    labels_nms_all = json.load(f)
with open(label_orig_dir / "labels_nms.json", "r") as f:
    labels_nms_orig = json.load(f)
with open(label_gen_dir / "labels_nms.json", "r") as f:
    labels_nms_gen = json.load(f)
with open(label_background_dir / "labels_nms.json", "r") as f:
    labels_nms_background = json.load(f)

with open(label_gen_dir / "stats_obj_nms.json", "r") as f:
    stats_obj_nms_gen = json.load(f)

with open(label_all_dir / "no_ann.json", "r") as f:
    no_ann_all = json.load(f)
with open(label_orig_dir / "no_ann.json", "r") as f:
    no_ann_orig = json.load(f)
with open(label_gen_dir / "no_ann.json", "r") as f:
    no_ann_gen = json.load(f)
with open(label_background_dir / "no_ann.json", "r") as f:
    no_ann_background = json.load(f)

## filter definition

### performance obj-wise filter
a list of objcts with MAP < 0.9 for training with original data

In [3]:
include_obj_ids = [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19]
include_objs = [class_id_to_name[id] for id in include_obj_ids]
include_objs

['articulated dump truck',
 'bulldozer',
 'combined piling and drilling rig',
 'crawler crane',
 'crawler loader',
 'duty cycle crane',
 'gantry crane',
 'log loader',
 'maritime crane',
 'material handling machine',
 'mining bulldozer',
 'mining truck',
 'mobile crane',
 'pipelayer',
 'pontoon excavator',
 'reachstacker',
 'telescopic handler',
 'tower crane']

### prompt-wise filter
a list of prompts that are not suitable for training with original data

In [4]:
unsuitable_prompts = {
    "articulated dump truck": [],
    "bulldozer": [],
    "combined piling and drilling rig": [],
    "crawler crane": [],
    "crawler excavator": [],
    "crawler loader": [],
    "duty cycle crane": ["underground_construction"],
    "gantry crane": [
        "underground_construction",
        "mining_site",
        "highway_construction",
        "city_street",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "rural_area",
        "forest_construction",
        "suburban_construction",
        "railway_construction",
        "solar_farm_construction",
        "skyscraper_construction",
        "wind_farm_construction",
        "multiple_machines_highway",
    ],
    "log loader": [],
    "maritime crane": [
        "underground_construction",
        "mining_site",
        "highway_construction",
        "city_street",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "rural_area",
        "forest_construction",
        "suburban_construction",
        "railway_construction",
        "solar_farm_construction",
        "skyscraper_construction",
        "wind_farm_construction",
        "multiple_machines_city_street",
        "multiple_machines_highway",
    ],
    "material handling machine": [],
    "mining bulldozer": [
        "city_street",
        "highway_construction",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "suburban_construction",
        "skyscraper_construction",
        "multiple_machines_city_street",
        "multiple_machines_highway",
    ],
    "mining excavator": [
        "city_street",
        "highway_construction",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "suburban_construction",
        "skyscraper_construction",
        "multiple_machines_city_street",
        "multiple_machines_highway",
    ],
    "mining truck": [
        "city_street",
        "highway_construction",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "suburban_construction",
        "skyscraper_construction",
        "multiple_machines_city_street",
        "multiple_machines_highway",
    ],
    "mobile crane": [],
    "pipelayer": [],
    "pontoon excavator": [
        "underground_construction",
        "mining_site",
        "highway_construction",
        "city_street",
        "urban_renewal",
        "urban_demolition",
        "urban_parking_lot",
        "rural_area",
        "forest_construction",
        "suburban_construction",
        "railway_construction",
        "solar_farm_construction",
        "skyscraper_construction",
        "wind_farm_construction",
        "multiple_machines_city_street",
        "multiple_machines_highway",
    ],
    "reachstacker": [],
    "telescopic handler": [],
    "tower crane": ["underground_construction"],
    "truck mixer": [],
    "wheel excavator": [],
    "wheel loader": [],
}
for obj, ls in unsuitable_prompts.items():
    print(f"{obj}: {len(ls)}")

articulated dump truck: 0
bulldozer: 0
combined piling and drilling rig: 0
crawler crane: 0
crawler excavator: 0
crawler loader: 0
duty cycle crane: 1
gantry crane: 15
log loader: 0
maritime crane: 16
material handling machine: 0
mining bulldozer: 9
mining excavator: 9
mining truck: 9
mobile crane: 0
pipelayer: 0
pontoon excavator: 16
reachstacker: 0
telescopic handler: 0
tower crane: 1
truck mixer: 0
wheel excavator: 0
wheel loader: 0


### LMM image-wise filter
a list of images not approved by lmm

In [5]:
with open(repo_dir / "no_lmm.json", "r") as f:
    no_lmm = json.load(f)
no_lmm[:5]

['d14520', 'd18678', 'd63579', 'd43780', 'd45061']

## split the data

In [6]:
import random

SEED = 0
random.seed(SEED)

# the test text files that need to be deduplicated
txt_names = [
    "test.txt",
]

### filter
with special prompts

all three filters

In [7]:
# filter 0: get rid of the images with no annotations
ids_gen = set(labels_nms_gen.keys()) - set(no_ann_gen)
ids_orig = set(labels_nms_orig.keys()) - set(no_ann_orig)
ids_background = set(labels_nms_background.keys()) - set(no_ann_background)

quota = {obj: 0 for obj in objs}
for id in ids_gen:
    obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
    orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
    # performance filter
    if obj not in include_objs:
        continue
    # prompt filter
    no_promopt_keys = unsuitable_prompts[obj]
    if any([key in orig_stem for key in no_promopt_keys]):
        continue
    # lmm filter
    if id in no_lmm:
        continue
    quota[obj] += 1
quota

{'articulated dump truck': 957,
 'bulldozer': 3805,
 'combined piling and drilling rig': 2877,
 'crawler crane': 3834,
 'crawler excavator': 0,
 'crawler loader': 1885,
 'duty cycle crane': 2785,
 'gantry crane': 2460,
 'log loader': 4796,
 'maritime crane': 2999,
 'material handling machine': 3838,
 'mining bulldozer': 3752,
 'mining excavator': 0,
 'mining truck': 1920,
 'mobile crane': 1816,
 'pipelayer': 1401,
 'pontoon excavator': 1629,
 'reachstacker': 931,
 'telescopic handler': 4775,
 'tower crane': 2680,
 'truck mixer': 0,
 'wheel excavator': 0,
 'wheel loader': 0}

#### without multiplyer

corrsponds to generated-to-original data ratio 4:1 in the paper

In [8]:
yolo_filter_dir = dataset_dir / "yolo_filter"  # TODO: change the name
yolo_filter_dir.mkdir(exist_ok=True)
os.symlink(image_dir, yolo_filter_dir / "images", target_is_directory=True)
os.symlink(
    generated_data_dir, yolo_filter_dir / "generated_data", target_is_directory=True
)
os.symlink(
    yolo_all_dir / "labels", yolo_filter_dir / "labels", target_is_directory=True
)

# training set composition:
# rest of the original data, all of the background data, and {multiplier} percentatge of the generated data

# for test_txt in yolo_dir.iterdir():
for txt_name in txt_names:
    test_txt = yolo_dir / txt_name

    with open(test_txt, "r") as f:
        lines = f.readlines()
    test_ids_orig = set([line.split("/")[-1].split(".")[0] for line in lines])
    # TODO: do not consider val for now
    train_ids_orig = ids_orig - test_ids_orig
    train_list = []
    # add rest of the original data
    for id in train_ids_orig:
        obj = id_to_name[id + ".jpg"].split("/")[0]
        train_list.append(f"./images/{obj}/{id}.jpg\n")
    # add all of the background data
    for id in ids_background:
        obj = id_to_name[id + ".jpg"].split("/")[0]
        train_list.append(f"./images/{obj}/{id}.jpg\n")
    # add {multiplier} percentatge of the generated data
    for obj in objs:
        no_promopt_keys = unsuitable_prompts[obj]
        # performance filter
        if obj not in include_objs:
            continue
        ids_gen_obj = [
            id
            for id in ids_gen
            if id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ") == obj
        ]
        ids_gen_obj_filtered = []
        for id in ids_gen_obj:
            # prompt filter
            orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
            if any([key in orig_stem for key in no_promopt_keys]):
                continue
            # lmm filter
            if id in no_lmm:
                continue
            ids_gen_obj_filtered.append(id)
        for id in ids_gen_obj_filtered:
            obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
            train_list.append(f"./generated_data/{obj}/{id}.jpg\n")
    # save
    shutil.copy(test_txt, yolo_filter_dir / test_txt.name)
    with open(yolo_filter_dir / test_txt.name.replace("test", "trainval"), "w") as f:
        f.writelines([f"{line}" for line in train_list])

print(f"for {test_txt.name}")
print(f"training dataset size: {len(train_list)} images")
print(f"from which {sum(quota.values())} images are generated data")

for test.txt
training dataset size: 61444 images
from which 49140 images are generated data


#### with multipliers

multipliers = [0.1,0.25,0.5,0.75] corrsponds to generated-to-original data ratio [0.5:1,1:1,2:1,3:1] in the paper

In [9]:
# use the quota for all filters but without multiplier as the baseline
quota_base = quota

# shuffle the ids_gen
# TODO: a better way to sample data
ids_gen_list = list(ids_gen)
random.shuffle(ids_gen_list)

In [10]:
# training set composition:
# rest of the original data, all of the background data, and {multiplier} percentatge of the generated data
# use {multiplier} percentatge of the data for training for each obj
multipliers = [0.1, 0.25, 0.5, 0.75]
# corrsponds to generated-to-original data ratio [0.5:1,1:1,2:1,3:1] in the paper

for multiplier in multipliers:
    yolo_filter_multiplier_dir = (
        dataset_dir / f"yolo_filter_{multiplier}"
    )  # TODO: change the name
    # resume
    if yolo_filter_multiplier_dir.exists():
        print(f"skipped {yolo_filter_multiplier_dir} since it already exists")
        continue
    yolo_filter_multiplier_dir.mkdir(exist_ok=True)
    os.symlink(
        image_dir, yolo_filter_multiplier_dir / "images", target_is_directory=True
    )
    os.symlink(
        generated_data_dir,
        yolo_filter_multiplier_dir / "generated_data",
        target_is_directory=True,
    )
    os.symlink(
        yolo_all_dir / "labels",
        yolo_filter_multiplier_dir / "labels",
        target_is_directory=True,
    )

    quota = {obj: 0 for obj in objs}
    for id in ids_gen_list:
        obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
        orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
        # multiplier
        if quota[obj] >= quota_base[obj] * multiplier:
            continue
        # performance filter
        if obj not in include_objs:
            continue
        # prompt filter
        no_promopt_keys = unsuitable_prompts[obj]
        if any([key in orig_stem for key in no_promopt_keys]):
            continue
        # lmm filter
        if id in no_lmm:
            continue
        quota[obj] += 1

    # for test_txt in yolo_dir.iterdir():
    for txt_name in txt_names:
        test_txt = yolo_dir / txt_name
        with open(test_txt, "r") as f:
            lines = f.readlines()
        test_ids_orig = set([line.split("/")[-1].split(".")[0] for line in lines])
        # TODO: do not consider val for now
        train_ids_orig = ids_orig - test_ids_orig
        train_list = []
        # add rest of the original data
        for id in train_ids_orig:
            obj = id_to_name[id + ".jpg"].split("/")[0]
            train_list.append(f"./images/{obj}/{id}.jpg\n")
        # add all of the background data
        for id in ids_background:
            obj = id_to_name[id + ".jpg"].split("/")[0]
            train_list.append(f"./images/{obj}/{id}.jpg\n")
        # add {multiplier} percentatge of the generated data
        for obj in objs:
            # performance filter
            if obj not in include_objs:
                continue
            ids_gen_obj = [
                id
                for id in ids_gen
                if id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ") == obj
            ]
            ids_gen_obj_filtered = []
            for id in ids_gen_obj:
                # prompt filter
                orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
                if any([key in orig_stem for key in no_promopt_keys]):
                    continue
                # lmm filter
                if id in no_lmm:
                    continue
                ids_gen_obj_filtered.append(id)
            random.seed(SEED)
            ids_gen_obj_sampled = random.sample(ids_gen_obj_filtered, quota[obj])
            for id in ids_gen_obj_sampled:
                obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
                train_list.append(f"./generated_data/{obj}/{id}.jpg\n")
        # save
        shutil.copy(test_txt, yolo_filter_multiplier_dir / test_txt.name)
        with open(
            yolo_filter_multiplier_dir / test_txt.name.replace("test", "trainval"), "w"
        ) as f:
            f.writelines([f"{line}" for line in train_list])

print(f"for {test_txt.name}")
print(f"training dataset size: {len(train_list)} images")
print(f"from which {sum(quota.values())} images are generated data")

for test.txt
training dataset size: 49164 images
from which 36860 images are generated data


### only gen

no original images

no performance filter 

In [11]:
# filter 0: get rid of the images with no annotations
ids_gen = set(labels_nms_gen.keys()) - set(no_ann_gen)
ids_orig = set(labels_nms_orig.keys()) - set(no_ann_orig)
ids_background = set(labels_nms_background.keys()) - set(no_ann_background)

quota = {obj: 0 for obj in objs}
for id in ids_gen:
    obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
    orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
    # no performance filter
    # prompt filter
    no_promopt_keys = unsuitable_prompts[obj]
    if any([key in orig_stem for key in no_promopt_keys]):
        continue
    # lmm filter
    if id in no_lmm:
        continue
    quota[obj] += 1
quota

{'articulated dump truck': 957,
 'bulldozer': 3805,
 'combined piling and drilling rig': 2877,
 'crawler crane': 3834,
 'crawler excavator': 1440,
 'crawler loader': 1885,
 'duty cycle crane': 2785,
 'gantry crane': 2460,
 'log loader': 4796,
 'maritime crane': 2999,
 'material handling machine': 3838,
 'mining bulldozer': 3752,
 'mining excavator': 2704,
 'mining truck': 1920,
 'mobile crane': 1816,
 'pipelayer': 1401,
 'pontoon excavator': 1629,
 'reachstacker': 931,
 'telescopic handler': 4775,
 'tower crane': 2680,
 'truck mixer': 3835,
 'wheel excavator': 2880,
 'wheel loader': 1920}

In [12]:
sum(quota.values())

61919

#### with multipliers


In [13]:
# use the quota for all filters but without multiplier as the baseline
quota_base = quota

# shuffle the ids_gen
# TODO: a better way to sample data
ids_gen_list = list(ids_gen)
random.shuffle(ids_gen_list)

In [14]:
# training set composition:
# rest of the original data, all of the background data, and {multiplier} percentatge of the generated data
# use {multiplier} percentatge of the data for training for each obj
multipliers = [0.2, 1.0]  # 0.2 corresponds to 1:0 ratio in the paper

for multiplier in multipliers:
    yolo_filter_multiplier_dir = (
        dataset_dir / f"yolo_gen_{multiplier}"
    )  # TODO: change the name
    # resume
    if yolo_filter_multiplier_dir.exists():
        print(f"skipped {yolo_filter_multiplier_dir} since it already exists")
        continue
    yolo_filter_multiplier_dir.mkdir(exist_ok=True)
    os.symlink(
        image_dir, yolo_filter_multiplier_dir / "images", target_is_directory=True
    )
    os.symlink(
        generated_data_dir,
        yolo_filter_multiplier_dir / "generated_data",
        target_is_directory=True,
    )
    os.symlink(
        yolo_all_dir / "labels",
        yolo_filter_multiplier_dir / "labels",
        target_is_directory=True,
    )

    quota = {obj: 0 for obj in objs}
    for id in ids_gen_list:
        obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
        orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
        # multiplier
        if quota[obj] >= quota_base[obj] * multiplier:
            continue
        # # performance filter
        # if obj not in include_objs:
        #     continue
        # prompt filter
        no_promopt_keys = unsuitable_prompts[obj]
        if any([key in orig_stem for key in no_promopt_keys]):
            continue
        # lmm filter
        if id in no_lmm:
            continue
        quota[obj] += 1

    # for test_txt in yolo_dir.iterdir():
    for txt_name in txt_names:
        if "gpt" in txt_name:  # no original data, so no gpt
            continue
        test_txt = yolo_dir / txt_name

        with open(test_txt, "r") as f:
            lines = f.readlines()
        test_ids_orig = set([line.split("/")[-1].split(".")[0] for line in lines])
        # TODO: do not consider val for now
        # train_ids_orig = ids_orig - test_ids_orig
        train_list = []
        # # add rest of the original data
        # for id in train_ids_orig:
        #     obj = id_to_name[id+'.jpg'].split('/')[0]
        #     train_list.append(f"./images/{obj}/{id}.jpg\n")
        # add all of the background data
        for id in ids_background:
            obj = id_to_name[id + ".jpg"].split("/")[0]
            train_list.append(f"./images/{obj}/{id}.jpg\n")
        # add {multiplier} percentatge of the generated data
        for obj in objs:
            # # performance filter
            # if obj not in include_objs:
            #     continue
            ids_gen_obj = [
                id
                for id in ids_gen
                if id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ") == obj
            ]
            ids_gen_obj_filtered = []
            for id in ids_gen_obj:
                # prompt filter
                orig_stem = id_to_name_gen[id + ".jpg"].split("/")[-1]
                if any([key in orig_stem for key in no_promopt_keys]):
                    continue
                # lmm filter
                if id in no_lmm:
                    continue
                ids_gen_obj_filtered.append(id)
            random.seed(SEED)
            ids_gen_obj_sampled = random.sample(ids_gen_obj_filtered, quota[obj])
            for id in ids_gen_obj_sampled:
                obj = id_to_name_gen[id + ".jpg"].split("/")[1].replace("_", " ")
                train_list.append(f"./generated_data/{obj}/{id}.jpg\n")
        # save
        shutil.copy(test_txt, yolo_filter_multiplier_dir / test_txt.name)
        with open(
            yolo_filter_multiplier_dir / test_txt.name.replace("test", "trainval"), "w"
        ) as f:
            f.writelines([f"{line}" for line in train_list])

print(f"for {test_txt.name}")
print(f"training dataset size: {len(train_list)} images")
print(f"from which {sum(quota.values())} images are generated data")

for test.txt
training dataset size: 61961 images
from which 61919 images are generated data
