In [1]:
from pathlib import Path
import json

In [2]:
dataset_dir = Path("..")

# setup paths
dataset_dir = Path("/mnt/ssd2/xin/repo/DART/Liebherr_Product")

# Define the images directory and duplicates directory using Path objects
image_dir = dataset_dir / "images"
meta_dir = dataset_dir / "metadata"
label_dir = dataset_dir / "labels"
yolo_dir = dataset_dir / "yolo"
yolo_labels_dir = yolo_dir / "labels"
yolo_labels_dir.mkdir(exist_ok=True, parents=True)


# List all objects in the image directory
objs = sorted([obj.name for obj in image_dir.iterdir()])

with open(meta_dir / "id_to_name.json", "r") as f:
    id_to_name = json.load(f)

with open(meta_dir / "near_duplicates.json", "r") as f:
    near_duplicates = json.load(f)

with open(label_dir / "labels_nms.json", "r") as f:
    labels_nms = json.load(f)

In [3]:
with open(label_dir / "no_gpt.json", "r") as f:
    no_gpt = json.load(f)

In [4]:
with open(meta_dir / "classes.json", "r") as f:
    class_dict = json.load(f)

for k, v in class_dict.items():
    print(f"  {k}: {v}")

  articulated dump truck: 0
  bulldozer: 1
  combined piling and drilling rig: 2
  crawler crane: 3
  crawler excavator: 4
  crawler loader: 5
  duty cycle crane: 6
  gantry crane: 7
  log loader: 8
  maritime crane: 9
  material handling machine: 10
  mining bulldozer: 11
  mining excavator: 12
  mining truck: 13
  mobile crane: 14
  pipelayer: 15
  pontoon excavator: 16
  reachstacker: 17
  telescopic handler: 18
  tower crane: 19
  truck mixer: 20
  wheel excavator: 21
  wheel loader: 22


# split the data

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

SEED = 42

In [6]:
def delete_not_approved(X_train, X_val, X_test, no_gpt):
    """
    delete files not approved by gpt from the list no_gpt
    """
    X_train_array = np.array(X_train)
    X_val_array = np.array(X_val)
    X_test_array = np.array(X_test)
    no_gpt_array = np.array(no_gpt)
    mask_train = np.isin(X_train_array, no_gpt_array)
    mask_val = np.isin(X_val_array, no_gpt_array)
    mask_test = np.isin(X_test_array, no_gpt_array)

    X_train_gpt = X_train_array[~mask_train].tolist()
    X_val_gpt = X_val_array[~mask_val].tolist()
    X_test_gpt = X_test_array[~mask_test].tolist()

    return X_train_gpt, X_val_gpt, X_test_gpt

In [7]:
def stratified_split(
    image_list,
    y_list,
    no_list,
    train_ratio=0.6,
    val_ratio=0.2,
    test_ratio=0.2,
    seed=SEED,
):
    # Convert lists to numpy arrays for easy indexing
    image_array = np.array(image_list)
    y_array = np.array(y_list)

    if no_list is None:
        remaining_images = image_array
        remaining_labels = y_array
        special_images = np.array([])
        special_labels = np.array([])
    else:
        # Identify indices of special images
        no_indices = np.isin(image_array, no_list)

        # Separate special images and their labels
        special_images = image_array[no_indices]
        special_labels = y_array[no_indices]

        # Remaining images and their labels
        remaining_images = image_array[~no_indices]
        remaining_labels = y_array[~no_indices]

    # Calculate the number of samples in each set
    total_samples = len(image_array)
    num_special_images = len(special_images)
    num_remaining_samples = total_samples - num_special_images

    # Adjust train size to include the special images
    adjusted_train_ratio = train_ratio - (num_special_images / total_samples)
    train_size = int(adjusted_train_ratio * num_remaining_samples)
    # test_size = int(test_ratio * total_samples)
    # val_size = num_remaining_samples - train_size - test_size
    val_size = int(val_ratio * num_remaining_samples)
    test_size = int(test_ratio * num_remaining_samples)

    # Split the remaining data into train, val, and test sets in a stratified manner
    # # method 1
    # X_train, X_temp, y_train, y_temp = train_test_split(remaining_images, remaining_labels, test_size=test_size+val_size, stratify=remaining_labels, random_state=seed)
    # X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_size, stratify=y_temp, random_state=seed)
    # # method 2
    X_temp, X_test, y_temp, y_test = train_test_split(
        remaining_images,
        remaining_labels,
        test_size=test_size,
        stratify=remaining_labels,
        random_state=seed,
    )
    if val_ratio == 0:
        X_train = X_temp
        y_train = y_temp
        X_val = np.empty(0)
        y_val = np.empty(0)
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_size, stratify=y_temp, random_state=seed
        )

    # Combine the train set with the special images
    X_train = np.concatenate((X_train, special_images))
    y_train = np.concatenate((y_train, special_labels))

    return (
        X_train.tolist(),
        X_val.tolist(),
        X_test.tolist(),
        y_train.tolist(),
        y_val.tolist(),
        y_test.tolist(),
    )

In [8]:
near_duplicates_list = [f[:5] for f in near_duplicates]

In [9]:
ids = []
id_to_y = {}
for id, ann in labels_nms.items():
    if ann["boxes"] == []:  # Skip images with no annotations
        continue
    obj = id_to_name[id + ".jpg"].split("/")[0]
    ids.append(id)
    id_to_y[id] = class_dict[obj]
ys = [id_to_y[id] for id in ids]

## 80/20

### stratified, dedup

In [10]:
X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(
    ids, ys, None, 0.64, 0.16, 0.2
)

## if in near_duplicates_list move to X_train
for id in X_val:
    if id in near_duplicates_list:
        X_train.append(id)
        X_val.remove(id)
for id in X_test:
    if id in near_duplicates_list:
        X_train.append(id)
        X_test.remove(id)

print(len(X_train), len(X_val), len(X_test))

9878 2384 3008


In [12]:
train_list = [
    f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_train
]
val_list = [f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_val]
test_list = [
    f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_test
]

train_file = yolo_dir / "train.txt"
val_file = yolo_dir / "val.txt"
trainval_file = yolo_dir / "trainval.txt"
test_file = yolo_dir / "test.txt"


with open(train_file, "w") as f:
    for item in train_list:
        f.write("%s\n" % item)

with open(val_file, "w") as f:
    for item in val_list:
        f.write("%s\n" % item)

with open(trainval_file, "w") as f:
    for item in train_list + val_list:
        f.write("%s\n" % item)

with open(test_file, "w") as f:
    for item in test_list:
        f.write("%s\n" % item)

#### gpt guided

In [10]:
X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(
    ids, ys, None, 0.64, 0.16, 0.2
)
X_train_gpt, X_val_gpt, X_test_gpt = delete_not_approved(X_train, X_val, X_test, no_gpt)

In [11]:
train_list = [
    f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_train_gpt
]
val_list = [
    f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_val_gpt
]
test_list = [
    f"./images/{id_to_name[id+'.jpg'].split('/')[0]}/{id}.jpg" for id in X_test_gpt
]

train_file = yolo_dir / "train_gpt.txt"
val_file = yolo_dir / "val_gpt.txt"
trainval_file = yolo_dir / "trainval_gpt.txt"
test_file = yolo_dir / "test_gpt.txt"


with open(train_file, "w") as f:
    for item in train_list:
        f.write("%s\n" % item)

with open(val_file, "w") as f:
    for item in val_list:
        f.write("%s\n" % item)

with open(trainval_file, "w") as f:
    for item in train_list + val_list:
        f.write("%s\n" % item)

with open(test_file, "w") as f:
    for item in test_list:
        f.write("%s\n" % item)