# 划分数据集，并将划分结果复制到新建的目录中

In [3]:
import os
import random
import shutil

# Source dataset root (one folder per class) and destination root for the split copy.
DATA_PATH = "D:/Project/biomed-clip-puNCE/output/FNAC"
OUTPUT_PATH = "..\\..\\output\\FNAC-split"

# Split fractions. test_ratio is not read below: the test split simply
# receives whatever is left after the train and validation slices.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# One destination directory per split, created up front.
train_dir, val_dir, test_dir = (
    os.path.join(OUTPUT_PATH, split_name)
    for split_name in ("train", "val", "test")
)
for split_root in (train_dir, val_dir, test_dir):
    os.makedirs(split_root, exist_ok=True)

# Class folders expected directly under DATA_PATH.
subdirectories = ["B", "M"]

# Each class is shuffled and split independently of the other.
for subdirectory in subdirectories:
    subdir_path = os.path.join(DATA_PATH, subdirectory)

    # Every entry of the class folder, in random order.
    file_list = os.listdir(subdir_path)
    random.shuffle(file_list)

    # Slice boundaries; int() truncates, so rounding leftovers go to test.
    num_files = len(file_list)
    num_train = int(train_ratio * num_files)
    num_val = int(val_ratio * num_files)
    val_end = num_train + num_val

    partitions = (
        (train_dir, file_list[:num_train]),
        (val_dir, file_list[num_train:val_end]),
        (test_dir, file_list[val_end:]),
    )

    # Copy each slice into <split>/<class>/, in train, val, test order.
    for split_root, names in partitions:
        class_dir = os.path.join(split_root, subdirectory)
        for name in names:
            # Created lazily so an empty slice leaves no class folder behind.
            os.makedirs(class_dir, exist_ok=True)
            shutil.copy(
                os.path.join(subdir_path, name),
                os.path.join(class_dir, name),
            )


# 生成W1标注文件

In [7]:
import os
import random

# Input directory: root of the image tree handed to split_dataset.
input_dir = r'D:/Project/biomed-clip-puNCE/output/FNAC'
# Output directory: receives train.txt / val.txt / test.txt.
# NOTE: this directory lies inside input_dir, so the directory walk also
# visits it; only .jpg/.png files are collected, so the .txt files are skipped.
output_dir = r'D:/Project/biomed-clip-puNCE/output/FNAC/annotations/w1'

def split_dataset(input_dir, output_dir, positive_class="B", seed=None):
    """Write shuffled train/val/test annotation lists for an image tree.

    Every ``.jpg``/``.png`` file (case-insensitive) found under ``input_dir``
    becomes one line ``"<path relative to input_dir> <label>"``. The shuffled
    list is cut 80% / 10% / remainder and written to ``train.txt``,
    ``val.txt`` and ``test.txt`` inside ``output_dir``.

    Args:
        input_dir: Root directory whose immediate subdirectories are the
            class folders.
        output_dir: Directory that receives the three ``.txt`` files; it is
            created if it does not exist.
        positive_class: Name of the class folder whose images are labelled
            ``1``; images located anywhere else are labelled ``0``.
        seed: Optional seed for a private random generator, which makes the
            split reproducible. With ``None`` the module-level ``random``
            state is used.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect (relative path, label) pairs for every image below input_dir.
    image_files = []
    for root, _dirs, files in os.walk(input_dir):
        # The label comes from the first directory component *below*
        # input_dir. Testing the whole walked path for a substring would
        # mislabel everything as soon as a parent directory name happened
        # to contain the class letter.
        rel_root = os.path.relpath(root, input_dir)
        class_name = rel_root.split(os.sep, 1)[0]
        category = 1 if class_name == positive_class else 0
        for file in files:
            if file.lower().endswith(('.jpg', '.png')):
                image_path = os.path.relpath(os.path.join(root, file), input_dir)
                image_files.append((image_path, category))

    # os.walk order depends on the filesystem; sorting first means the same
    # seed always yields the same split.
    image_files.sort()
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(image_files)

    # int() truncates, so any rounding leftovers end up in the test split.
    num_images = len(image_files)
    num_train = int(0.8 * num_images)
    num_val = int(0.1 * num_images)

    partitions = (
        ('train.txt', image_files[:num_train]),
        ('val.txt', image_files[num_train:num_train + num_val]),
        ('test.txt', image_files[num_train + num_val:]),
    )
    for file_name, entries in partitions:
        with open(os.path.join(output_dir, file_name), 'w') as split_file:
            split_file.writelines(
                f"{image_file} {category}\n" for image_file, category in entries
            )

    print("Dataset split and saved successfully.")


# Run the split with the input/output directories configured above.
split_dataset(input_dir, output_dir)


Dataset split and saved successfully.


# 生成W2标注文件

In [None]:
import os
import random

def split_dataset(input_dir, output_dir, positive_class="M", seed=None):
    """Write shuffled train/val/test annotation lists for an image tree.

    Every ``.jpg``/``.png`` file (case-insensitive) found under ``input_dir``
    becomes one line ``"<path relative to input_dir> <label>"``. The shuffled
    list is cut 80% / 10% / remainder and written to ``train.txt``,
    ``val.txt`` and ``test.txt`` inside ``output_dir``.

    Args:
        input_dir: Root directory whose immediate subdirectories are the
            class folders.
        output_dir: Directory that receives the three ``.txt`` files; it is
            created if it does not exist.
        positive_class: Name of the class folder whose images are labelled
            ``1``; images located anywhere else are labelled ``0``.
        seed: Optional seed for a private random generator, which makes the
            split reproducible. With ``None`` the module-level ``random``
            state is used.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect (relative path, label) pairs for every image below input_dir.
    image_files = []
    for root, _dirs, files in os.walk(input_dir):
        # The label comes from the first directory component *below*
        # input_dir. Testing the whole walked path for a substring would
        # mislabel everything as soon as a parent directory name happened
        # to contain the class letter.
        rel_root = os.path.relpath(root, input_dir)
        class_name = rel_root.split(os.sep, 1)[0]
        category = 1 if class_name == positive_class else 0
        for file in files:
            if file.lower().endswith(('.jpg', '.png')):
                image_path = os.path.relpath(os.path.join(root, file), input_dir)
                image_files.append((image_path, category))

    # os.walk order depends on the filesystem; sorting first means the same
    # seed always yields the same split.
    image_files.sort()
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(image_files)

    # int() truncates, so any rounding leftovers end up in the test split.
    num_images = len(image_files)
    num_train = int(0.8 * num_images)
    num_val = int(0.1 * num_images)

    partitions = (
        ('train.txt', image_files[:num_train]),
        ('val.txt', image_files[num_train:num_train + num_val]),
        ('test.txt', image_files[num_train + num_val:]),
    )
    for file_name, entries in partitions:
        with open(os.path.join(output_dir, file_name), 'w') as split_file:
            split_file.writelines(
                f"{image_file} {category}\n" for image_file, category in entries
            )

    print("Dataset split and saved successfully.")

# Input directory: root of the image tree handed to split_dataset.
input_dir = r'D:/Project/biomed-clip-puNCE/output/FNAC'
# Output directory: receives train.txt / val.txt / test.txt.
# NOTE(review): this cell is headed as the w2 annotation step, yet the path
# below ends in "w1" (and sits under "FNAC-annotations", not
# "FNAC/annotations" as in the W1 cell) — confirm whether "w2" was intended.
output_dir = r'D:/Project/biomed-clip-puNCE/output/FNAC-annotations/w1'
split_dataset(input_dir, output_dir)
