## Setup

In [None]:
# Import libraries
import hashlib
import os
from pathlib import Path

import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

In [None]:
# Dataset root, expressed relative to the notebook's working directory
data_dir = os.path.join(
    "..",
    "..",
    "datasets",
    "coco4000_nolarge_subclass",
)

# Class-id lookup: key = id to keep, value = id it is remapped to.
# Both sides are strings so they can be compared with text tokens directly.
class_map = {
    "0": "0",
    "15": "1",
}

In [None]:
# Delete every file matching *.cache anywhere below the dataset root
dataset_root = Path(data_dir)
for cache_file in dataset_root.rglob("*.cache"):
    cache_file.unlink()

In [None]:
# Get all .txt or .jpg files
def get_files(type="labels", root=None):
    """Recursively list the label or image files of the dataset.

    Parameters
    ----------
    type : str, default "labels"
        Which files to collect: ``"labels"`` matches ``*.txt`` and
        ``"images"`` matches ``*.jpg``. (The name shadows the ``type``
        builtin; it is kept so existing keyword calls keep working.)
    root : str or Path, optional
        Directory to search. Defaults to the module-level ``data_dir``,
        which is looked up at call time.

    Returns
    -------
    list of pathlib.Path
        Every matching path found below ``root``, in the order yielded by
        ``Path.rglob``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported kinds. Previously this case
        silently returned ``None``, which only failed later in the caller.
    """
    patterns = {"labels": "*.txt", "images": "*.jpg"}
    if type not in patterns:
        raise ValueError(
            f"Unknown file type {type!r}; expected one of {sorted(patterns)}"
        )

    # resolve the default lazily so a redefined data_dir is picked up
    base = Path(data_dir if root is None else root)
    return list(base.rglob(patterns[type]))

In [None]:
# Get information about each annotation
def get_metadata():
    """Collect per-annotation and per-file metadata from every label file.

    Each non-blank line of a label file is read as a class id followed by
    whitespace-separated numbers taken as alternating x, y values.
    NOTE(review): this matches a polygon-style label (``class x1 y1 x2 y2 ...``);
    a ``class cx cy w h`` box line would be misread -- confirm the label format.

    Returns
    -------
    ann_meta : pandas.DataFrame
        One row per annotation line with columns:
        ``filepath`` (label file path as a string), ``class`` (class id as a
        string), ``w`` / ``h`` (extent of the x / y values, ``max - min``),
        ``a`` (``w * h``), ``split`` (name of the directory containing the
        label file with its last four characters removed -- presumably a
        year suffix such as "2017"; TODO confirm) and ``filenames`` (the
        file's base name).
    file_meta : pandas.Series
        Indexed by ``filepath``; each value is the comma-joined, sorted set
        of class ids found in that file. Files without any annotation line
        do not appear.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two coordinate values or a
        coordinate is not a valid float.
    """
    # one (filepath, class, w, h) record per annotation line
    records = []

    for txt_file in tqdm(get_files(), desc="Get Metadata"):
        with open(txt_file, "r") as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.split()

                # blank lines (e.g. a trailing newline) carry no annotation;
                # without this guard they raised IndexError
                if not parts:
                    continue

                class_id, coords = parts[0], parts[1:]
                if len(coords) < 2:
                    # same exception type as before, but says where it happened
                    raise ValueError(
                        f"{txt_file}:{line_no}: expected a class id followed "
                        f"by x/y coordinates, got {line!r}"
                    )

                # even positions are x values, odd positions are y values
                x = [float(s) for s in coords[0::2]]
                y = [float(s) for s in coords[1::2]]
                records.append(
                    (str(txt_file), class_id, max(x) - min(x), max(y) - min(y))
                )

    # convert to dataframe
    ann_meta = pd.DataFrame(records, columns=["filepath", "class", "w", "h"])
    ann_meta["a"] = ann_meta["w"] * ann_meta["h"]

    # Use pathlib instead of splitting on "/" at a fixed index, so the result
    # does not depend on the OS path separator or on how deep data_dir is.
    ann_meta["split"] = ann_meta["filepath"].map(
        lambda p: Path(p).parent.name[:-4]
    )
    ann_meta["filenames"] = ann_meta["filepath"].map(lambda p: Path(p).name)

    # identify classes in each file (ids are strings, so the sort is lexical)
    file_meta = (
        ann_meta.groupby("filepath")["class"]
        .unique()
        .map(lambda ids: ",".join([str(s) for s in sorted(ids)]))
    )

    return ann_meta, file_meta

## Dataset changes

In [None]:
# # Remove images with large objects in them
# ann_meta, file_meta = get_metadata()
# for p in tqdm(list(set(ann_meta.loc[ann_meta["a"]>1/9, "filepath"])), desc="Remove large objects"):
#     os.remove(p.replace("labels","images").replace("txt","jpg"))
#     os.remove(p)

In [None]:
# # Subset classes and remap ids

# # loop over all label files
# for txt_file in tqdm(get_files()):

#     # initialise output
#     filtered_lines = []

#     # open file
#     with open(txt_file, "r") as f:
#         lines = f.readlines()

#     # loop over all lines in file
#     for line in lines:

#         # split out label components
#         parts = line.split()

#         # ignore undesirable classes and remap
#         if parts[0] in class_map:
#             parts[0] = str(class_map[parts[0]])
#             filtered_lines.append(" ".join(parts) + "\n")

#     # write back to disk
#     if filtered_lines:
#         with open(txt_file, "w") as f:
#             f.writelines(filtered_lines)
#     else:
#         os.remove(txt_file)

In [None]:
# # Downsample popular classes

# # get metadata
# ann_meta, file_meta = get_metadata()

# # remove 95% of images that only contain people
# for p in tqdm(file_meta.loc[file_meta=="0"].index.tolist()):
#     if "train" in p and int(hashlib.sha256(p.encode("utf-8")).hexdigest(), 16)%20>0:
#         os.remove(p.replace("labels","images").replace("txt","jpg"))
#         os.remove(p)

# # remove 95% of images that only contain background
# background_path = list({str(s) for s in  get_files(type="images")} - {p.replace("labels","images").replace("txt","jpg") for p in file_meta.index})
# for p in tqdm(background_path):
#     if "train" in p and int(hashlib.sha256(p.encode("utf-8")).hexdigest(), 16)%20>0:
#         os.remove(p)

In [None]:
# Rebuild the metadata tables from the files currently on disk
ann_meta, file_meta = get_metadata()

# Empirical CDF of the per-annotation area "a" (w * h), one curve per class
sns.ecdfplot(data=ann_meta, x="a", hue="class")

# Number of annotations per class, then number of label files per class combination
display(
    ann_meta["class"].value_counts(),
    file_meta.value_counts(),
)

# Total number of .jpg images found under the dataset root
print(len(get_files("images")))