# 生成w3标注

In [14]:
import os
import random

DATA_ROOT = "D:/Dataset/"

# input_dir = DATA_ROOT+'TCT-NGC-2023/base-data/'
input_dir = DATA_ROOT+'TCT-NGC-2023/'
output_dir = DATA_ROOT+'TCT-NGC-2023/annotations/w3/'

# NILM对应1，POSITIVE对应0
sub_dirs = ['Unannotated_KSJ/Unannotated-KSJ-TCTNGC-NILM/', 
         'Unannotated_KSJ/Unannotated-KSJ-TCTNGC-POS/',
         'Unannotated_XIMEA/Unannotated-XIMEA-TCTNGC-NILM/',
         'Unannotated_XIMEA/Unannotated-XIMEA-TCTNGC-POS/']
labels = [1,
          0,
          1,
          0]


def get_all_wsi(input_dir, sub_dirs, labels):
    """get all wsi names, not directories"""
    wsi_set = set()
    for i, sub_dir in enumerate(sub_dirs):
        for wsi_dir in os.listdir(input_dir+sub_dir):
            item = (wsi_dir, labels[i])
            wsi_set.add(item)

    return wsi_set

def split_w3_dataset(wsi_set, output_dir):
    """split wsi_set and save to annotation files"""
    # 创建输出目录
    os.makedirs(output_dir, exist_ok=True)

    # 打乱图片文件列表
    wsi_set = list(wsi_set)
    random.shuffle(wsi_set)
 
    # 计算训练、验证和测试集的数量
    num_wsi = len(wsi_set)
    num_train = int(0.8 * num_wsi)
    num_val = int(0.1 * num_wsi)
    
    # 划分数据集并保存到相应文件
    with open(os.path.join(output_dir, 'train.txt'), 'w') as train_file:
        for wsi_name, category in wsi_set[:num_train]:
            train_file.write(f"{wsi_name} {category}\n")

    with open(os.path.join(output_dir, 'val.txt'), 'w') as val_file:
        for wsi_name, category in wsi_set[num_train:num_train + num_val]:
            val_file.write(f"{wsi_name} {category}\n")

    with open(os.path.join(output_dir, 'test.txt'), 'w') as test_file:
        for wsi_name, category in wsi_set[num_train + num_val:]:
            test_file.write(f"{wsi_name} {category}\n")
    print("W3 for TCT-NGC-2023 split and saved successfully.")

def main():
    # 获取所有wsi的唯一标识
    wsi_set = get_all_wsi(input_dir, sub_dirs, labels)
    # 随机划分
    split_w3_dataset(wsi_set, output_dir)

main()

W3 for TCT-NGC-2023 split and saved successfully.


# 查看w3标注信息

In [2]:
import os
import numpy as np

DATA_ROOT = "D:/Dataset/"

ann_dir = DATA_ROOT+'TCT-NGC-2023/annotations/w3/'
files=['train.txt', 'val.txt', 'test.txt']

def info_ann(file_name):
    print(f"{file_name}中的各类WSI数量如下：")
    items = []
    with open(ann_dir+file_name, 'r') as file:
        for line in file:
            items.append(line.strip().split())
    
    labels = [int(data[1]) for data in items]
    max_label = max(labels)
    counts = np.zeros(max_label+1, dtype=np.uint16)
    
    # 统计标注为 0 和 1 的个数
    for item in items:
        label = int(item[1])
        counts[label] += 1

    # 输出统计结果
    for i, count in enumerate(counts):
        print('类别{}: {:>5}'.format(i, count))
    print('全部 : {:>5}'.format(sum(counts)))
    return sum(counts)

def main():
    sum_all = 0    
    for file in files:
        sum_all += info_ann(file)
    print("  ")
    print('总和 : {:>5}'.format(sum_all))
main()    

train.txt中的各类WSI数量如下：
类别0:   363
类别1:   389
全部 :   752
val.txt中的各类WSI数量如下：
类别0:    53
类别1:    41
全部 :    94
test.txt中的各类WSI数量如下：
类别0:    46
类别1:    48
全部 :    94
  
总和 :   940


# 构建w1数据集标注

In [9]:
import os

# **一般只需要修改这个 就可以在不同机器上运行
# **所有路径都不要使用单个反斜杠
# Benign 对应 1，malignant 对应 0
DATA_ROOT = "D:/Dataset/"

input_dir = DATA_ROOT+'TCT-NGC-2023/'
w3_dir = DATA_ROOT+'TCT-NGC-2023/annotations/w3/'
output_dir = DATA_ROOT+'TCT-NGC-2023/annotations/w1/'

# NILM对应1，POSITIVE对应0
sub_dirs = ['Unannotated_KSJ/Unannotated-KSJ-TCTNGC-NILM/', 
         'Unannotated_KSJ/Unannotated-KSJ-TCTNGC-POS/',
         'Unannotated_XIMEA/Unannotated-XIMEA-TCTNGC-NILM/',
         'Unannotated_XIMEA/Unannotated-XIMEA-TCTNGC-POS/']
labels = [1,
          0,
          1,
          0]

ann_files=['train.txt', 'val.txt', 'test.txt']


def write_files_to_file(root_dir, dir, label, output_dir, file):
    os.makedirs(output_dir, exist_ok=True)
    image_files = list()
    for img in os.listdir(dir):
        image_files.append((os.path.relpath(dir + img, root_dir), label))
        
    with open(os.path.join(output_dir, file), 'w') as w_file:
        for image_file, category in image_files:
            w_file.write(f"{image_file} {category}\n")

def main():
    # 读取w3标注的三个子集分别的信息
    # 下面两个list中的元素都为三个list 分别对应train val test
    wsi_partitions = list()
    for file_name in ann_files:
        wsi_part = list()
        wsi_label = list()
        # 读取一个文件 比如'train.txt'
        with open(os.path.join(w3_dir, file_name)) as file:
            for line in file:
                line = line.split()
                wsi_part.append(line[0])
                wsi_label.append(line[1])
        wsi_partitions.append(wsi_part)

    # 将三个标注集对应的图片集合路径加入w1标注集
    for i, sub_dir in enumerate(sub_dirs):
        for wsi_name in os.listdir(input_dir + sub_dir):
            if wsi_name in wsi_part[0]:
                write_files_to_file(input_dir, input_dir + sub_dir + wsi_name, labels[i], output_dir, 'train.txt')
            elif wsi_name in wsi_part[1]:
                write_files_to_file(input_dir, input_dir + sub_dir + wsi_name, labels[i], output_dir, 'val.txt')
            elif wsi_name in wsi_part[2]:
                write_files_to_file(input_dir, input_dir + sub_dir + wsi_name, labels[i], output_dir, 'test.txt')

main()


# 查看w1标注信息

In [11]:
import os
import numpy as np

DATA_ROOT = "D:/Dataset/"

ann_dir = DATA_ROOT+'TCT-NGC-2023/annotations/w1/'
files=['train.txt', 'val.txt', 'test.txt']

def info_ann(file_name):
    print(f"{file_name}中的各类WSI数量如下：")
    items = []
    with open(ann_dir+file_name, 'r') as file:
        for line in file:
            items.append(line.strip().split())
    
    labels = [int(data[1]) for data in items]
    max_label = max(labels)
    counts = np.zeros(max_label+1, dtype=np.uint32)
    
    # 统计标注为 0 和 1 的个数
    for item in items:
        label = int(item[1])
        counts[label] += 1

    # 输出统计结果
    for i, count in enumerate(counts):
        print('类别{}: {:>5}'.format(i, count))
    print('全部 : {:>5}'.format(sum(counts)))
    return sum(counts)

def main():
    sum_all = 0    
    for file in files:
        sum_all += info_ann(file)
    print("  ")
    print('总和 : {:>5}'.format(sum_all))
main()    

train.txt中的各类WSI数量如下：
类别0:   532
全部 :   532
val.txt中的各类WSI数量如下：
类别0:     0
类别1:   513
全部 :   513
test.txt中的各类WSI数量如下：


ValueError: max() arg is an empty sequence