# 划分数据集并重新创建一个目录存储

In [3]:
import os
import random
import shutil

# Directory configuration.
DATA_PATH = "D:/Project/biomed-clip-puNCE/output/FNAC"
# Forward slashes keep this relative path usable on both Windows and POSIX;
# a backslash is not a path separator on POSIX systems.
OUTPUT_PATH = "../../output/FNAC-split"

# Split ratios. The test split receives whatever remains after the train and
# validation slices, so test_ratio is informational only.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Directories that receive the split results.
train_dir = os.path.join(OUTPUT_PATH, "train")
val_dir = os.path.join(OUTPUT_PATH, "val")
test_dir = os.path.join(OUTPUT_PATH, "test")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Class sub-directories expected under DATA_PATH.
subdirectories = ["B", "M"]

# Split every class directory independently so each split keeps both classes.
for subdirectory in subdirectories:
    subdir_path = os.path.join(DATA_PATH, subdirectory)

    # Keep regular files only: shutil.copy cannot copy a directory.
    # Sorting first makes the shuffle depend only on the random state instead
    # of the arbitrary order returned by os.listdir.
    file_list = sorted(
        name
        for name in os.listdir(subdir_path)
        if os.path.isfile(os.path.join(subdir_path, name))
    )
    random.shuffle(file_list)

    # Split boundaries.
    num_files = len(file_list)
    num_train = int(train_ratio * num_files)
    num_val = int(val_ratio * num_files)

    train_files = file_list[:num_train]
    val_files = file_list[num_train:num_train + num_val]
    test_files = file_list[num_train + num_val:]

    # Copy each slice into <split>/<class>/.
    for split_dir, split_files in (
        (train_dir, train_files),
        (val_dir, val_files),
        (test_dir, test_files),
    ):
        dst_dir = os.path.join(split_dir, subdirectory)
        os.makedirs(dst_dir, exist_ok=True)
        for file in split_files:
            src_path = os.path.join(subdir_path, file)
            dst_path = os.path.join(dst_dir, file)
            shutil.copy(src_path, dst_path)


# 生成W3标注文件

In [3]:
import os
import random
import shutil
import json

# ** Usually DATA_ROOT is the only value that has to change to run this on
#    another machine.
# ** Never use a single backslash in any path.
# Benign maps to 1, malignant maps to 0.

DATA_ROOT = "D:/Dataset/"
config_name = '../../settings/environment.json'
# A local settings file, when present, overrides the default data root.
if os.path.exists(config_name):
    with open(config_name, 'r') as file:
        info = json.load(file)
        DATA_ROOT = info["FNAC_DATA_ROOT"]

# The paths below are built by plain string concatenation, so make sure a
# non-empty root ends with a separator even if the configured value omits it.
if DATA_ROOT and not DATA_ROOT.endswith(('/', '\\')):
    DATA_ROOT += '/'

input_dir = DATA_ROOT+'FNAC-CROP/base-data/'
output_dir = DATA_ROOT+'FNAC-CROP/annotations/w3/'

def get_all_wsi(input_dir):
    """Collect every WSI directory under ``input_dir`` together with its label.

    A directory is recorded as a WSI when it directly contains at least one
    file. Its label is 1 when its parent directory is named "B" and 0
    otherwise.

    Args:
        input_dir: Root directory that is walked recursively.

    Returns:
        A set of ``(wsi_name, category)`` tuples, where ``wsi_name`` is the
        base name of the directory holding the files.
    """
    wsi_set = set()
    for root, _dirs, files in os.walk(input_dir):
        # Directories without files of their own are not WSI directories.
        if not files:
            continue
        # The class is encoded by the directory one level above the WSI.
        category = 1 if os.path.basename(os.path.dirname(root)) == "B" else 0
        wsi_set.add((os.path.basename(root), category))
    # The collected set must be handed back to the caller.
    return wsi_set

def split_w3_dataset(input_dir, output_dir, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Split the WSIs under ``input_dir`` into train/val/test annotation files.

    Writes ``train.txt``, ``val.txt`` and ``test.txt`` into ``output_dir``;
    every line has the form ``"<wsi_name> <category>"``. The test split
    receives whatever remains after the train and validation slices.

    Args:
        input_dir: Root directory passed to ``get_all_wsi``, which is expected
            to return an iterable of ``(wsi_name, category)`` tuples.
        output_dir: Directory that receives the three files; created if
            missing.
        train_ratio: Fraction of WSIs assigned to the training split.
        val_ratio: Fraction of WSIs assigned to the validation split.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sort before shuffling: set iteration order is not stable across
    # interpreter runs, so without sorting even a seeded shuffle would produce
    # a different split each time.
    wsi_list = sorted(get_all_wsi(input_dir))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(wsi_list)

    # Split boundaries.
    num_wsi = len(wsi_list)
    num_train = int(train_ratio * num_wsi)
    num_val = int(val_ratio * num_wsi)

    splits = (
        ('train.txt', wsi_list[:num_train]),
        ('val.txt', wsi_list[num_train:num_train + num_val]),
        ('test.txt', wsi_list[num_train + num_val:]),
    )
    for split_name, entries in splits:
        with open(os.path.join(output_dir, split_name), 'w') as ann_file:
            ann_file.writelines(
                f"{wsi_name} {category}\n" for wsi_name, category in entries
            )
    print("W3 for FNAC-CROP split and saved successfully.")

# split_w3_dataset(input_dir, output_dir)


# 统计w3标注信息

In [4]:
import os
import numpy as np

import json  # needed by json.load below; this cell does not import it otherwise

DATA_ROOT = "D:/Dataset/"
config_name = '../../settings/environment.json'
# A local settings file, when present, overrides the default data root.
if os.path.exists(config_name):
    with open(config_name, 'r') as file:
        info = json.load(file)
        DATA_ROOT = info["FNAC_DATA_ROOT"]

# ann_dir is built by plain string concatenation, so make sure a non-empty
# root ends with a separator even if the configured value omits it.
if DATA_ROOT and not DATA_ROOT.endswith(('/', '\\')):
    DATA_ROOT += '/'

ann_dir = DATA_ROOT+'FNAC-CROP/annotations/w3/'
files=['train.txt', 'val.txt', 'test.txt']

def info_ann(file_name, ann_root=None):
    """Print the number of entries per label in one annotation file.

    Every non-blank line must end with an integer label separated from the
    rest of the line by whitespace. Blank lines are ignored.

    Args:
        file_name: Name of the annotation file inside the annotation directory.
        ann_root: Directory holding the file. Defaults to the module-level
            ``ann_dir``.

    Returns:
        The total number of entries in the file as a Python ``int``.

    Raises:
        ValueError: If the last field of a non-blank line is not an integer.
    """
    directory = ann_dir if ann_root is None else ann_root
    print(f"{file_name}中的各类WSI数量如下：")

    labels = []
    with open(os.path.join(directory, file_name), 'r') as file:
        for line in file:
            if not line.strip():
                continue
            # Take the last field so names containing spaces still parse.
            labels.append(int(line.rsplit(maxsplit=1)[-1]))

    # int64 counters: uint16 would wrap around above 65535 entries.
    # default=-1 yields an empty counter array for an empty file.
    counts = np.zeros(max(labels, default=-1) + 1, dtype=np.int64)
    for label in labels:
        counts[label] += 1

    # Report the per-label counts followed by the file total.
    for i, count in enumerate(counts):
        print('类别{}: {:>5}'.format(i, int(count)))
    total = int(counts.sum())
    print('全部 : {:>5}'.format(total))
    return total

def main():
    """Print the label statistics of every annotation file, then the grand total."""
    grand_total = sum(info_ann(name) for name in files)
    print("  ")
    print('总和 : {:>5}'.format(grand_total))


main()

train.txt中的各类WSI数量如下：
类别0:    91
类别1:    78
全部 :   169
val.txt中的各类WSI数量如下：
类别0:    11
类别1:    10
全部 :    21
test.txt中的各类WSI数量如下：
类别0:    11
类别1:    11
全部 :    22
  
总和 :   212


# 生成W1标注文件

In [5]:
import os

import json  # needed by json.load below; this cell does not import it otherwise

# ** Never use a single backslash in any path.
# Benign maps to 1, malignant maps to 0.
DATA_ROOT = "D:/Dataset/"
config_name = '../../settings/environment.json'
# A local settings file, when present, overrides the default data root.
if os.path.exists(config_name):
    with open(config_name, 'r') as file:
        info = json.load(file)
        DATA_ROOT = info["FNAC_DATA_ROOT"]

# The paths below are built by plain string concatenation, so make sure a
# non-empty root ends with a separator even if the configured value omits it.
if DATA_ROOT and not DATA_ROOT.endswith(('/', '\\')):
    DATA_ROOT += '/'

input_dir = DATA_ROOT+'FNAC-CROP/base-data/'
w3_dir = DATA_ROOT+'FNAC-CROP/annotations/w3/'
output_dir = DATA_ROOT+'FNAC-CROP/annotations/w1/'

files=['train.txt', 'val.txt', 'test.txt']

def w1_split_dataset(input_dir, w3_dir, file_name, output_dir):
    """Derive a patch-level (w1) annotation file from a WSI-level (w3) one.

    Reads ``file_name`` from ``w3_dir``, finds the directory of every listed
    WSI below ``input_dir`` and writes one ``"<relative_path> <label>"`` line
    per ``.jpg``/``.png`` image found directly in those directories. Every
    image inherits the label of its WSI.

    Args:
        input_dir: Root directory that contains the WSI directories.
        w3_dir: Directory holding the w3 annotation files.
        file_name: Annotation file name, used for both input and output.
        output_dir: Directory that receives the w1 file; created if missing.

    Raises:
        ValueError: If a non-blank w3 line is not of the form
            ``"<wsi_name> <int_label>"``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # WSI name -> label. The label is the last field, so names containing
    # spaces still parse; the first occurrence of a name wins.
    wsi_to_label = {}
    with open(os.path.join(w3_dir, file_name), 'r') as w3_file:
        for line in w3_file:
            if not line.strip():
                continue
            parts = line.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"Malformed annotation line in {file_name}: {line!r}"
                )
            wsi_to_label.setdefault(parts[0].strip(), int(parts[1]))

    # Locate the directory of every listed WSI.
    wsi_dirs = []
    for root, dirs, _files in os.walk(input_dir):
        dirs.sort()  # fixed traversal order -> reproducible output order
        for dir_name in dirs:
            if dir_name in wsi_to_label:
                wsi_dirs.append(
                    (os.path.join(root, dir_name), wsi_to_label[dir_name])
                )

    # Collect the images of each WSI.
    image_files = []
    for wsi_dir, category in wsi_dirs:
        for entry in sorted(os.listdir(wsi_dir)):
            if entry.lower().endswith(('.jpg', '.png')):
                rel_path = os.path.relpath(os.path.join(wsi_dir, entry), input_dir)
                # Always store forward slashes so the annotation file is
                # identical on Windows and POSIX.
                image_files.append((rel_path.replace(os.sep, '/'), category))

    # Write the patch-level annotation file.
    with open(os.path.join(output_dir, file_name), 'w') as out_file:
        out_file.writelines(
            f"{image_file} {category}\n" for image_file, category in image_files
        )

# Generate one w1 annotation file for each w3 split file listed in `files`.
for file_name in files:
    w1_split_dataset(input_dir, w3_dir, file_name, output_dir)


# 统计w1标注信息

In [6]:
import os
import numpy as np

import json  # needed by json.load below; this cell does not import it otherwise

DATA_ROOT = "D:/Dataset/"
config_name = '../../settings/environment.json'
# A local settings file, when present, overrides the default data root.
if os.path.exists(config_name):
    with open(config_name, 'r') as file:
        info = json.load(file)
        DATA_ROOT = info["FNAC_DATA_ROOT"]

# ann_dir is built by plain string concatenation, so make sure a non-empty
# root ends with a separator even if the configured value omits it.
if DATA_ROOT and not DATA_ROOT.endswith(('/', '\\')):
    DATA_ROOT += '/'

ann_dir = DATA_ROOT+'FNAC-CROP/annotations/w1/'
files=['train.txt', 'val.txt', 'test.txt']

def info_ann(file_name, ann_root=None):
    """Print the number of entries per label in one annotation file.

    Every non-blank line must end with an integer label separated from the
    rest of the line by whitespace. Blank lines are ignored.

    Args:
        file_name: Name of the annotation file inside the annotation directory.
        ann_root: Directory holding the file. Defaults to the module-level
            ``ann_dir``.

    Returns:
        The total number of entries in the file as a Python ``int``.

    Raises:
        ValueError: If the last field of a non-blank line is not an integer.
    """
    directory = ann_dir if ann_root is None else ann_root
    # NOTE(review): the message says "WSI" although ann_dir in this cell points
    # at the w1 annotations; the text is kept so the printed output is unchanged.
    print(f"{file_name}中的各类WSI数量如下：")

    labels = []
    with open(os.path.join(directory, file_name), 'r') as file:
        for line in file:
            if not line.strip():
                continue
            # Take the last field so paths containing spaces still parse.
            labels.append(int(line.rsplit(maxsplit=1)[-1]))

    # int64 counters: uint16 would wrap around above 65535 entries.
    # default=-1 yields an empty counter array for an empty file.
    counts = np.zeros(max(labels, default=-1) + 1, dtype=np.int64)
    for label in labels:
        counts[label] += 1

    # Report the per-label counts followed by the file total.
    for i, count in enumerate(counts):
        print('类别{}: {:>6}'.format(i, int(count)))
    total = int(counts.sum())
    print('全部 : {:>6}'.format(total))
    return total

def main():
    """Print the label statistics of every annotation file, then the grand total."""
    grand_total = sum(info_ann(name) for name in files)
    print("  ")
    print('总和 : {:>6}'.format(grand_total))


main()

train.txt中的各类WSI数量如下：
类别0:  15015
类别1:  12870
全部 :  27885
val.txt中的各类WSI数量如下：
类别0:   1815
类别1:   1650
全部 :   3465
test.txt中的各类WSI数量如下：
类别0:   1815
类别1:   1815
全部 :   3630
  
总和 :  34980
