In [None]:
from pathlib import Path
import yaml
dataset_path = Path(".")

# Collect every YOLO label file that sits in a directory whose name ends in
# "labels". The K-fold output tree created further down in this notebook lives
# inside `dataset_path` and contains its own ".../labels/*.txt" copies, so any
# path below a "*Fold_Cross-val" directory is skipped; otherwise a re-run would
# pick the same label up several times and leak samples between train and val.
labels = sorted(
    label_file
    for label_file in dataset_path.rglob("*labels/*.txt")
    if not any(part.endswith("Fold_Cross-val") for part in label_file.parts)
)

In [None]:
yaml_file = "timmies.yaml"  # your data YAML with data directories and class names
with open(yaml_file, "r", encoding="utf8") as y:
    classes = yaml.safe_load(y)["names"]

# `names` may be written either as an {index: name} mapping or as a plain list.
# Normalise the list form to a mapping so that `classes.keys()` always yields
# the integer class ids used in the label files.
if isinstance(classes, list):
    classes = dict(enumerate(classes))
cls_idx = sorted(classes.keys())

In [None]:
import pandas as pd

# One row per label file, keyed by the file's stem (base name without
# extension), and one column per class id; every cell starts out empty.
index = [label_file.stem for label_file in labels]
labels_df = pd.DataFrame(index=index, columns=cls_idx)

In [None]:
from collections import Counter

# Count, for every label file, how many annotations of each class it contains.
for label in labels:
    lbl_counter = Counter()

    with open(label, "r", encoding="utf8") as lf:
        for line in lf:
            # split() with no argument tolerates tabs / repeated spaces and
            # yields an empty list for blank lines, which are skipped instead
            # of crashing int() on an empty token.
            fields = line.split()
            if not fields:
                continue
            # classes for YOLO label uses integer at first position of each line
            lbl_counter[int(fields[0])] += 1

    # The Counter is aligned on the class-id columns; classes absent from this
    # file are left as NaN and zero-filled below.
    labels_df.loc[label.stem] = lbl_counter

# Make the counts numeric explicitly (the frame starts as object dtype), then
# replace `nan` values with `0.0`.
labels_df = labels_df.astype(float).fillna(0.0)

In [None]:
# Display the per-file class counts: one row per label-file stem, one column per class id.
labels_df

K-Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

# Number of folds to generate.
ksplit = 2

# Shuffled K-fold splitter; the fixed random_state makes the split repeatable.
kf = KFold(
    n_splits=ksplit,
    shuffle=True,
    random_state=20,
)

# Materialise the (train, val) positional index pairs once so that every later
# cell works on exactly the same folds.
kfolds = [(train_rows, val_rows) for train_rows, val_rows in kf.split(labels_df)]

In [None]:
# One column per fold ("split_1" ... "split_<ksplit>"), one row per label-file
# stem; each cell will hold "train" or "val" for that file in that fold.
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=index, columns=folds)

for i, (train, val) in enumerate(kfolds, start=1):
    # Assign through a single .loc[rows, column] call. The chained form
    # `folds_df[col].loc[rows] = ...` writes into a temporary object and leaves
    # `folds_df` untouched when pandas Copy-on-Write is active.
    folds_df.loc[labels_df.iloc[train].index, f"split_{i}"] = "train"
    folds_df.loc[labels_df.iloc[val].index, f"split_{i}"] = "val"

In [None]:
# Per-fold ratio of validation to training annotation counts, one column per class id.
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for fold_name, (train_rows, val_rows) in zip(folds, kfolds):
    val_counts = labels_df.iloc[val_rows].sum()
    train_counts = labels_df.iloc[train_rows].sum()

    # The 1E-7 offset keeps the denominator non-zero for classes that never
    # appear in the training part of a fold.
    fold_lbl_distrb.loc[fold_name] = val_counts / (train_counts + 1e-7)

In [None]:
import datetime

supported_extensions = [".jpg", ".jpeg", ".png"]

# Gather image paths below <dataset_path>/images, one extension at a time
# (sorted within each extension, extensions in the order listed above).
image_root = dataset_path / "images"
images = [
    image_file
    for ext in supported_extensions
    for image_file in sorted(image_root.rglob(f"*{ext}"))
]

# Output root, named after today's date and the number of folds.
save_path = dataset_path / f"{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val"
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Directory skeleton for this fold: {train,val}/{images,labels}
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    for subset in ("train", "val"):
        for content in ("images", "labels"):
            (split_dir / subset / content).mkdir(parents=True, exist_ok=True)

    # Dataset YAML describing this fold; its path is remembered in `ds_yamls`.
    dataset_yaml = split_dir / f"{split}_dataset.yaml"
    ds_yamls.append(dataset_yaml)

    fold_config = {
        "path": split_dir.as_posix(),
        "train": "train",
        "val": "val",
        "names": classes,
    }
    with open(dataset_yaml, "w") as ds_y:
        yaml.safe_dump(fold_config, ds_y)

In [None]:
import shutil

# Pair each image with its label by file stem. Pairing by position (zip) is not
# reliable here: `images` is grouped by extension while `labels` is sorted
# globally, and an image without a label would shift every following pair.
labels_by_stem = {label.stem: label for label in labels}
unlabelled_images = []

for image in images:
    label = labels_by_stem.get(image.stem)
    if label is None:
        # No label file (and therefore no row in `folds_df`) for this image.
        unlabelled_images.append(image)
        continue

    for split, k_split in folds_df.loc[image.stem].items():
        # Destination directory
        img_to_path = save_path / split / k_split / "images"
        lbl_to_path = save_path / split / k_split / "labels"

        # Copy image and label files to the fold directory. An existing
        # destination file is overwritten; shutil.SameFileError is raised only
        # if source and destination are the very same file.
        shutil.copy(image, img_to_path / image.name)
        shutil.copy(label, lbl_to_path / label.name)

if unlabelled_images:
    print(f"Skipped {len(unlabelled_images)} image(s) without a matching label file.")

In [None]:
# Persist the fold assignment and the per-fold class ratios next to the splits.
for frame, csv_name in (
    (folds_df, "kfold_datasplit.csv"),
    (fold_lbl_distrb, "kfold_label_distribution.csv"),
):
    frame.to_csv(save_path / csv_name)

In [None]:
from ultralytics.data.utils import autosplit

# Run Ultralytics' autosplit over the raw training images with all of the
# weight on the first split.
# NOTE(review): presumably the weights are ordered (train, val, test) — confirm
# against the Ultralytics autosplit documentation.
autosplit_kwargs = {
    "path": "paper cup.v5i.yolov8/train/images",
    "weights": (1, 0, 0.0),
    "annotated_only": False,
}
autosplit(**autosplit_kwargs)

In [None]:
import os
import imagehash
from PIL import Image

# Folder whose images are de-duplicated in place (near-duplicates are deleted from disk)
folder_path = "paper cup.v5i.yolov8/train/images"
hash_size = 8  # Hash size (higher value increases accuracy but reduces speed)
similarity_threshold = 5  # Hashes closer than this Hamming distance count as duplicates (lower = stricter)

# File suffixes treated as images. The leading dot matters: without it a file
# whose name merely ends in e.g. "png" would be treated as an image.
dedupe_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

# Maps the hash of every image kept so far to that image's path
hash_dict = {}


def get_image_hash(image_path):
    """Return the average hash of the image at ``image_path``, or ``None`` on failure.

    Any exception raised while opening or hashing the file is reported on
    stdout and swallowed, so a single unreadable file does not abort the scan.
    The hash is computed with the module-level ``hash_size``.
    """
    try:
        with Image.open(image_path) as img:
            return imagehash.average_hash(img, hash_size=hash_size)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None


# Process images in sorted order so that which file of a near-duplicate group
# survives does not depend on the arbitrary order returned by os.listdir().
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    if not (os.path.isfile(file_path) and filename.lower().endswith(dedupe_suffixes)):
        continue

    img_hash = get_image_hash(file_path)
    if img_hash is None:  # unreadable image, already reported
        continue

    # Check if a similar image has already been kept
    for stored_hash, stored_path in hash_dict.items():
        if img_hash - stored_hash < similarity_threshold:  # Hamming distance check
            print(f"Deleting duplicate: {file_path} (similar to {stored_path})")
            os.remove(file_path)
            break
    else:
        hash_dict[img_hash] = file_path  # Store new hash if not similar

print("Duplicate removal complete.")


In [None]:
import os

# Set folder paths
jpg_folder = "paper cup.v5i.yolov8/train/images"  # Folder containing the images
txt_folder = "paper cup.v5i.yolov8/train/labels"  # Folder containing .txt files

# Suffixes that count as an image. Not just ".jpg": the rest of this notebook
# also accepts other image types, and their labels must not be deleted.
labelled_image_suffixes = {".jpg", ".jpeg", ".png", ".bmp", ".gif"}

# Get all image filenames (without extensions)
jpg_files = {
    os.path.splitext(f)[0]
    for f in os.listdir(jpg_folder)
    if os.path.splitext(f)[1].lower() in labelled_image_suffixes
}

# Process text files (sorted so the log output is in a stable order)
for txt_file in sorted(os.listdir(txt_folder)):
    if txt_file.endswith('.txt'):
        txt_name = os.path.splitext(txt_file)[0]  # Remove .txt extension

        # If no matching image exists, delete txt file
        txt_path = os.path.join(txt_folder, txt_file)
        if txt_name not in jpg_files:
            print(f"Deleting: {txt_path}")
            os.remove(txt_path)

print("Cleanup complete!")
