## BUSI

In [7]:
import os
import shutil
from pathlib import Path

# --- Settings: source split, destination folders, per-class quota ---
input_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/Breast-BUSI/train")
output_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/semi_seg/BUSI")
imgs_dir, masks_dir = output_dir / "imgs", output_dir / "masks"
output_txt = output_dir / "semi.txt"  # manifest listing the selected images
samples_per_class = 40  # how many images to keep from each class (lowest numbers first)

# --- Make sure both destination folders exist (no error if already present) ---
for _subdir in (imgs_dir, masks_dir):
    _subdir.mkdir(parents=True, exist_ok=True)

# === SCAN INPUT IMAGES ===
def extract_number(filename):
    """Return the integer written between the last "(" and the following ")".

    Example: "benign_(2).png" -> 2.

    Raises:
        ValueError: if the text found there is not a valid integer (for
            instance when the name contains no parentheses at all).
    """
    after_open_paren = filename.rpartition("(")[2]
    number_text = after_open_paren.partition(")")[0]
    return int(number_text)

# os.listdir() returns entries in arbitrary order, so the names are sorted up
# front. Grouping, the (stable) per-class sort and the top-N cut below then
# give the same result on every run and on every filesystem.
all_imgs = sorted(f for f in os.listdir(input_dir / "imgs") if f.endswith(".png"))

# Group filenames by class; the class is the text before the first underscore.
class_dict = {}
for fname in all_imgs:
    class_name = fname.split("_")[0]
    class_dict.setdefault(class_name, []).append(fname)

# Sort each class group by the extracted number. sorted() is stable, so files
# that share a number keep the alphabetical order established above.
for class_name in class_dict:
    class_dict[class_name] = sorted(class_dict[class_name], key=extract_number)

# === SELECT TOP-N IMAGES PER CLASS (BENIGN FIRST) ===
# 'benign' is processed first and any other class follows alphabetically, so
# the manifest order no longer depends on the directory listing order.
selected_files = []
for class_name in sorted(class_dict, key=lambda name: (name != "benign", name)):
    files = class_dict[class_name]
    sampled = files[:samples_per_class]  # first N files after sorting by number
    selected_files.extend(sampled)

# === CHECK MASKS BEFORE WRITING ANYTHING ===
# Failing here, rather than in the middle of the copy loop, avoids leaving a
# half-populated output folder and a truncated manifest behind.
missing_masks = [
    fname for fname in selected_files
    if not (input_dir / "masks" / fname).is_file()
]
if missing_masks:
    raise FileNotFoundError(
        f"{len(missing_masks)} selected image(s) have no mask in "
        f"{input_dir / 'masks'}: {missing_masks[:5]}"
    )

# === COPY SELECTED IMAGES AND MASKS TO OUTPUT FOLDERS ===
# The mask is looked up under the same filename as its image.
with open(output_txt, "w") as f:
    for fname in selected_files:
        src_img = input_dir / "imgs" / fname
        dst_img = imgs_dir / fname
        shutil.copy(src_img, dst_img)

        src_mask = input_dir / "masks" / fname
        dst_mask = masks_dir / fname
        shutil.copy(src_mask, dst_mask)

        # One manifest line per image, relative to the parent of output_dir.
        f.write(f"BUSI/imgs/{fname}\n")

print(f"✅ Done! Copied {len(selected_files)} files to {output_dir}")


✅ Done! Copied 80 files to /mnt/HDD1/tuong/TRUST/dataset/semi_seg/BUSI


## UCLM

In [6]:
import os
import shutil
from pathlib import Path

# --- Settings: source split, destination folders, per-class quota ---
input_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/Breast-UCLM/train")
output_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/semi_seg/UCLM")
imgs_dir, masks_dir = output_dir / "imgs", output_dir / "masks"
output_txt = output_dir / "semi.txt"  # manifest listing the selected images
samples_per_class = 40  # how many images to keep from each class

# --- Make sure both destination folders exist (no error if already present) ---
for _subdir in (imgs_dir, masks_dir):
    _subdir.mkdir(parents=True, exist_ok=True)

# === Extract number from UCLM filenames ===
def extract_number(filename):
    """Return the integer held in the last underscore-separated field.

    ".png" is removed from that field and leading zeros are ignored; a field
    that is empty or consists only of zeros yields 0.
    Example: "benign_007.png" -> 7.

    Raises:
        ValueError: if what remains of the field is not a valid integer.
    """
    last_field = filename.rpartition("_")[2]
    number_text = last_field.replace(".png", "")
    significant = number_text.lstrip("0")
    if not significant:
        significant = "0"
    return int(significant)

# === SCAN INPUT IMAGES ===
# os.listdir() returns entries in arbitrary order, so the names are sorted up
# front. Grouping, the (stable) per-class sort and the top-N cut below then
# give the same result on every run and on every filesystem.
all_imgs = sorted(f for f in os.listdir(input_dir / "imgs") if f.endswith(".png"))

# === GROUP FILENAMES BY CLASS ===
# The class is the text before the first underscore.
class_dict = {}
for fname in all_imgs:
    class_name = fname.split("_")[0]  # e.g. "benign", "malignant"
    class_dict.setdefault(class_name, []).append(fname)

# === SORT EACH CLASS BY EXTRACTED NUMBER ===
# sorted() is stable, so files that share a number keep the alphabetical
# order established above.
for class_name in class_dict:
    class_dict[class_name] = sorted(class_dict[class_name], key=extract_number)

# === SELECT TOP-N IMAGES PER CLASS (SEQUENTIAL ORDER, BENIGN FIRST) ===
# 'benign' is processed first; any other class follows alphabetically so the
# manifest order does not depend on the directory listing order.
selected_files = []
for class_name in sorted(class_dict, key=lambda name: (name != "benign", name)):
    files = class_dict[class_name]
    sampled = files[:samples_per_class]
    selected_files.extend(sampled)

# === CHECK MASKS BEFORE WRITING ANYTHING ===
# Failing here, rather than in the middle of the copy loop, avoids leaving a
# half-populated output folder and a truncated manifest behind.
missing_masks = [
    fname for fname in selected_files
    if not (input_dir / "masks" / fname).is_file()
]
if missing_masks:
    raise FileNotFoundError(
        f"{len(missing_masks)} selected image(s) have no mask in "
        f"{input_dir / 'masks'}: {missing_masks[:5]}"
    )

# === COPY SELECTED IMAGES AND MASKS ===
# The mask is looked up under the same filename as its image.
with open(output_txt, "w") as f:
    for fname in selected_files:
        src_img = input_dir / "imgs" / fname
        dst_img = imgs_dir / fname
        shutil.copy(src_img, dst_img)

        src_mask = input_dir / "masks" / fname
        dst_mask = masks_dir / fname
        shutil.copy(src_mask, dst_mask)

        # One manifest line per image, relative to the parent of output_dir.
        f.write(f"UCLM/imgs/{fname}\n")

print(f"✅ Done! Copied {len(selected_files)} files to {output_dir}")


✅ Done! Copied 80 files to /mnt/HDD1/tuong/TRUST/dataset/semi_seg/UCLM


## UDIAT

In [1]:
import os
import shutil
from pathlib import Path

# --- Settings: source split, destination folders, per-class quota ---
input_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/Breast-UDIAT/train")
output_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/semi_seg/UDIAT")
imgs_dir, masks_dir = output_dir / "imgs", output_dir / "masks"
output_txt = output_dir / "semi.txt"  # manifest listing the selected images
samples_per_class = 40  # how many images to keep from each class

# --- Make sure both destination folders exist (no error if already present) ---
for _subdir in (imgs_dir, masks_dir):
    _subdir.mkdir(parents=True, exist_ok=True)

# === Extract number from UDIAT filenames ===
def extract_number(filename):
    """Parse the numeric index stored after the final underscore of a name.

    The ".png" text is dropped and leading zeros are stripped before the
    conversion; if nothing is left afterwards the result is 0.
    Example: "malignant_0012.png" -> 12.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    index_text = filename.rpartition("_")[2].replace(".png", "")
    without_zeros = index_text.lstrip("0")
    return int(without_zeros if without_zeros else "0")

# === SCAN INPUT IMAGES ===
# os.listdir() returns entries in arbitrary order, so the names are sorted up
# front. Grouping, the (stable) per-class sort and the top-N cut below then
# give the same result on every run and on every filesystem.
all_imgs = sorted(f for f in os.listdir(input_dir / "imgs") if f.endswith(".png"))

# === GROUP FILENAMES BY CLASS ===
# The class is the text before the first underscore.
class_dict = {}
for fname in all_imgs:
    class_name = fname.split("_")[0]  # e.g. "benign", "malignant"
    class_dict.setdefault(class_name, []).append(fname)

# === SORT EACH CLASS BY EXTRACTED NUMBER ===
# sorted() is stable, so files that share a number keep the alphabetical
# order established above.
for class_name in class_dict:
    class_dict[class_name] = sorted(class_dict[class_name], key=extract_number)

# === SELECT TOP-N IMAGES PER CLASS (SEQUENTIAL ORDER, BENIGN FIRST) ===
# 'benign' is processed first; any other class follows alphabetically so the
# manifest order does not depend on the directory listing order.
selected_files = []
for class_name in sorted(class_dict, key=lambda name: (name != "benign", name)):
    files = class_dict[class_name]
    sampled = files[:samples_per_class]
    selected_files.extend(sampled)

# === CHECK MASKS BEFORE WRITING ANYTHING ===
# Failing here, rather than in the middle of the copy loop, avoids leaving a
# half-populated output folder and a truncated manifest behind.
missing_masks = [
    fname for fname in selected_files
    if not (input_dir / "masks" / fname).is_file()
]
if missing_masks:
    raise FileNotFoundError(
        f"{len(missing_masks)} selected image(s) have no mask in "
        f"{input_dir / 'masks'}: {missing_masks[:5]}"
    )

# === COPY SELECTED IMAGES AND MASKS ===
# The mask is looked up under the same filename as its image.
with open(output_txt, "w") as f:
    for fname in selected_files:
        src_img = input_dir / "imgs" / fname
        dst_img = imgs_dir / fname
        shutil.copy(src_img, dst_img)

        src_mask = input_dir / "masks" / fname
        dst_mask = masks_dir / fname
        shutil.copy(src_mask, dst_mask)

        # One manifest line per image, relative to the parent of output_dir.
        f.write(f"UDIAT/imgs/{fname}\n")

print(f"✅ Done! Copied {len(selected_files)} files to {output_dir}")


✅ Done! Copied 80 files to /mnt/HDD1/tuong/TRUST/dataset/semi_seg/UDIAT


## BUSBRA

In [7]:
import os
import shutil
from pathlib import Path

# --- Settings: source split, destination folders, per-class quota ---
# NOTE(review): this cell writes under ".../dataset/semi/" with 20 images per
# class, whereas the BUSI, UCLM and UDIAT cells of this notebook write under
# ".../dataset/semi_seg/" with 40 per class — confirm the difference is intended.
input_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/Breast-BUSBRA/train")
output_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/semi/BUSBRA")
imgs_dir, masks_dir = output_dir / "imgs", output_dir / "masks"
output_txt = output_dir / "semi.txt"  # manifest listing the selected images
samples_per_class = 20  # how many images to keep from each class

# --- Make sure both destination folders exist (no error if already present) ---
for _subdir in (imgs_dir, masks_dir):
    _subdir.mkdir(parents=True, exist_ok=True)

# === Extract number from BUSBRA filenames ===
def extract_number(filename):
    """Return the integer at the start of the last underscore-separated field.

    Within that field, everything from the first "-" onwards is discarded,
    ".png" is removed and leading zeros are ignored; if nothing remains the
    result is 0. Example: "benign_0001-l.png" -> 1.

    Because the "-..." part is dropped, different filenames can map to the
    same number.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    last_field = filename.rpartition("_")[2]
    number_text = last_field.partition("-")[0].replace(".png", "")
    significant = number_text.lstrip("0")
    return int(significant if significant else "0")

# === SCAN INPUT IMAGES ===
# os.listdir() returns entries in arbitrary order, so the names are sorted up
# front. extract_number() drops any "-suffix", so several files can share the
# same sort key; without this initial sort, which of them survive the top-N
# cut would depend on the directory listing order.
all_imgs = sorted(f for f in os.listdir(input_dir / "imgs") if f.endswith(".png"))

# === GROUP FILENAMES BY CLASS ===
# The class is the text before the first underscore.
class_dict = {}
for fname in all_imgs:
    class_name = fname.split("_")[0]  # e.g. "benign", "malignant"
    class_dict.setdefault(class_name, []).append(fname)

# === SORT EACH CLASS BY EXTRACTED NUMBER ===
# sorted() is stable, so files that share a number keep the alphabetical
# order established above.
for class_name in class_dict:
    class_dict[class_name] = sorted(class_dict[class_name], key=extract_number)

# === SELECT TOP-N IMAGES PER CLASS (SEQUENTIAL ORDER, BENIGN FIRST) ===
# 'benign' is processed first; any other class follows alphabetically so the
# manifest order does not depend on the directory listing order.
selected_files = []
for class_name in sorted(class_dict, key=lambda name: (name != "benign", name)):
    files = class_dict[class_name]
    sampled = files[:samples_per_class]
    selected_files.extend(sampled)

# === CHECK MASKS BEFORE WRITING ANYTHING ===
# Failing here, rather than in the middle of the copy loop, avoids leaving a
# half-populated output folder and a truncated manifest behind.
missing_masks = [
    fname for fname in selected_files
    if not (input_dir / "masks" / fname).is_file()
]
if missing_masks:
    raise FileNotFoundError(
        f"{len(missing_masks)} selected image(s) have no mask in "
        f"{input_dir / 'masks'}: {missing_masks[:5]}"
    )

# === COPY SELECTED IMAGES AND MASKS ===
# The mask is looked up under the same filename as its image.
with open(output_txt, "w") as f:
    for fname in selected_files:
        src_img = input_dir / "imgs" / fname
        dst_img = imgs_dir / fname
        shutil.copy(src_img, dst_img)

        src_mask = input_dir / "masks" / fname
        dst_mask = masks_dir / fname
        shutil.copy(src_mask, dst_mask)

        # One manifest line per image, relative to the parent of output_dir.
        f.write(f"BUSBRA/imgs/{fname}\n")

print(f"✅ Done! Copied {len(selected_files)} files to {output_dir}")


✅ Done! Copied 40 files to /mnt/HDD1/tuong/TRUST/dataset/semi/BUSBRA


## OASBUD

In [5]:
import os
import shutil
from pathlib import Path

# --- Settings: source split, destination folders, per-class quota ---
# NOTE(review): this cell writes under ".../dataset/semi/" with 20 images per
# class, whereas the BUSI, UCLM and UDIAT cells of this notebook write under
# ".../dataset/semi_seg/" with 40 per class — confirm the difference is intended.
input_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/Breast-OASBUD/train")
output_dir = Path("/mnt/HDD1/tuong/TRUST/dataset/semi/OASBUD")
imgs_dir, masks_dir = output_dir / "imgs", output_dir / "masks"
output_txt = output_dir / "semi.txt"  # manifest listing the selected images
samples_per_class = 20  # how many images to keep from each class

# --- Make sure both destination folders exist (no error if already present) ---
for _subdir in (imgs_dir, masks_dir):
    _subdir.mkdir(parents=True, exist_ok=True)

# === Extract number from OASBUD filenames ===
def extract_number(filename):
    """Convert the trailing index of a filename to an int.

    The index is the text after the last underscore with ".png" removed.
    Leading zeros are stripped first; an empty or all-zero index gives 0.
    Example: "benign_035.png" -> 35.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    pieces = filename.split("_")
    index_text = pieces[-1].replace(".png", "")
    trimmed = index_text.lstrip("0")
    if trimmed == "":
        return int("0")
    return int(trimmed)

# === SCAN INPUT IMAGES ===
# os.listdir() returns entries in arbitrary order, so the names are sorted up
# front. Grouping, the (stable) per-class sort and the top-N cut below then
# give the same result on every run and on every filesystem.
all_imgs = sorted(f for f in os.listdir(input_dir / "imgs") if f.endswith(".png"))

# === GROUP FILENAMES BY CLASS ===
# The class is the text before the first underscore.
class_dict = {}
for fname in all_imgs:
    class_name = fname.split("_")[0]  # e.g. "benign", "malignant"
    class_dict.setdefault(class_name, []).append(fname)

# === SORT EACH CLASS BY EXTRACTED NUMBER ===
# sorted() is stable, so files that share a number keep the alphabetical
# order established above.
for class_name in class_dict:
    class_dict[class_name] = sorted(class_dict[class_name], key=extract_number)

# === SELECT TOP-N IMAGES PER CLASS (SEQUENTIAL ORDER, BENIGN FIRST) ===
# 'benign' is processed first; any other class follows alphabetically so the
# manifest order does not depend on the directory listing order.
selected_files = []
for class_name in sorted(class_dict, key=lambda name: (name != "benign", name)):
    files = class_dict[class_name]
    sampled = files[:samples_per_class]
    selected_files.extend(sampled)

# === CHECK MASKS BEFORE WRITING ANYTHING ===
# Failing here, rather than in the middle of the copy loop, avoids leaving a
# half-populated output folder and a truncated manifest behind.
missing_masks = [
    fname for fname in selected_files
    if not (input_dir / "masks" / fname).is_file()
]
if missing_masks:
    raise FileNotFoundError(
        f"{len(missing_masks)} selected image(s) have no mask in "
        f"{input_dir / 'masks'}: {missing_masks[:5]}"
    )

# === COPY SELECTED IMAGES AND MASKS ===
# The mask is looked up under the same filename as its image.
with open(output_txt, "w") as f:
    for fname in selected_files:
        src_img = input_dir / "imgs" / fname
        dst_img = imgs_dir / fname
        shutil.copy(src_img, dst_img)

        src_mask = input_dir / "masks" / fname
        dst_mask = masks_dir / fname
        shutil.copy(src_mask, dst_mask)

        # One manifest line per image, relative to the parent of output_dir.
        f.write(f"OASBUD/imgs/{fname}\n")

print(f"✅ Done! Copied {len(selected_files)} files to {output_dir}")


✅ Done! Copied 40 files to /mnt/HDD1/tuong/TRUST/dataset/semi/OASBUD
