# 统计检测特征数据

In [6]:
import glob, os
from tqdm import tqdm
import pandas as pd

label_path = '/data/hjl/data/wsi_label_2000.csv'
input_dir = '/data/hjl/data/ori_img_feats/embed6'
output_dir = '/data/hjl/data/TCTGC-2000-Lesion'
wsi_names = os.listdir(input_dir)

df = pd.read_csv(label_path)
label_wsis = df['wsi_id'].tolist()
label_wsis = [wsi.replace('.pt', '') for wsi in label_wsis]
print(label_wsis)


for label_wsi in tqdm(label_wsis):
    input_path = os.path.join(input_dir, label_wsi)
    output_path = os.path.join(output_dir, label_wsi)
    if os.path.exists(output_path):
        continue
    os.makedirs(output_path, exist_ok=True)
    patches = dict()
    npys = os.listdir(input_path)
    for npy in npys:
        feat_name = os.path.basename(npy)
        base, extension = os.path.splitext(feat_name)
        patch_name = base.split('-')[-1]
        if patch_name not in patches:
            patches[patch_name] = []
        patches[patch_name].append(feat_name)
    for patch_name in patches:
        with open(os.path.join(output_path, patch_name + '.txt'), 'w') as f:
            f.write('\n'.join(patches[patch_name]))

print(len(patches))


['L2001876', 'L2001934', 'L2001972', 'L2001995', 'C02S2647', 'T-SZ230214031', 'T2307040262', 'T2309220210', 'C02S3084', 'T220716045', 'T220302122', 'XY73804109CIN3', 'T2211150425', 'T2211070784', 'T-SZ2308210037', 'T-SZ230312019', 'XY0965', 'chuxiong20231215_T-3', 'CX20133414', '002907781', 'A0327509', 'T2308020530', '191800429701-LSIL', 'T-SZ230328011', 'L2009661', '115574250-2-1', 'chuxiong20240116_231-2705-US', 'chuxiong20240112_241-0037', 'T220112279', 'Tm23067288', 'T221210285', 'T230112212', 'CX20161539-LSIL', 'T2209170611', 'xCR20018898-LSIL', 'T2307200764', 'xCY20007899-M', 'CX20171827-LSIL', 'T2308110977', 'S0272337-28000033', 'L2011205', 'xCR20018912-ASC-H', 'T-SZ230208001', 'HBSY-15-T20-04531', 'T-SZ2309050086', 'CX20171007-ASC-US', 'T2303170321', 'C02S2663', 'CX20103956', 'L2003371', 'T220115228', 'T220624006-NILM', 'T2303190239', 'T2306181046', 'T2308220693', 'CX20180941-NILM', 'XY09059029CIN1', 'chongqingfy_20231213_23033297-2', 'fengcheng_22-NILM', 'T220119287', 'T230112

100%|██████████| 2000/2000 [00:31<00:00, 63.57it/s] 

453





# Patch dataset

In [4]:
import os
import numpy as np
import torch

root_dir = '/data/hjl/data/ori_img_feats/embed6'
patch2lesion_path = '/data/hjl/data/TCTGC-2000-Lesion'
wsi_name = '4CX21211225-NILM'
wsi_path = os.path.join(patch2lesion_path, wsi_name)

wsi_feat = []
patch_names = os.listdir(wsi_path)
for patch_name in patch_names:
    with open(os.path.join(wsi_path, patch_name), 'r') as f:
        patch_feat = []
        for line in f:
            lesion_name = line.strip()
            patch_feat.append(np.load(os.path.join(root_dir, wsi_name, lesion_name)))
        patch_feat = torch.tensor(patch_feat)
    # print(patch_feat.shape)
    wsi_feat.append(patch_feat)
len(wsi_feat)

328

In [4]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os, torch
import numpy as np

class WSI_Lesion_Feat(Dataset):
    # 以WSI为基本单位  以两层list的形式返回所有lesion的feature [patch,patch,...] patch:[lesion, lesion]
    def __init__(self, root_dir, patch2lesion_path, label_path):
        self.root_dir = root_dir 
        self.patch2lesion_path = patch2lesion_path
        self.label_path = label_path
        df = pd.read_csv(label_path)
        self.wsi_names = [wsi_name.replace('.pt', '') for wsi_name in df['wsi_id'].tolist()]
        self.labels = df['wsi_label'].tolist()
        
        # sort to ensure reproducibility
    def __getitem__(self, idx):
        root_dir = self.root_dir
        patch2lesion_path = self.patch2lesion_path
        wsi_name = self.wsi_names[idx]
        label = self.labels[idx]
        wsi_path = os.path.join(patch2lesion_path, wsi_name)
        
        wsi_feat = []
        patch_names = os.listdir(wsi_path)
        for patch_name in patch_names:
            with open(os.path.join(wsi_path, patch_name), 'r') as f:
                patch_feat = []
                for line in f:
                    lesion_name = line.strip()
                    patch_feat.append(np.load(os.path.join(root_dir, wsi_name, lesion_name)))
                patch_feat = torch.tensor(patch_feat)
            wsi_feat.append(patch_feat)
        return wsi_feat, label
    
    def __len__(self):
        return len(self.wsi_names)

root_dir = '/data/hjl/data/ori_img_feats/embed6'
patch2lesion_path = '/data/hjl/data/TCTGC-2000-Lesion'
label_path = '/data/hjl/data/wsi_label_2000.csv'
dataset = WSI_Lesion_Feat(root_dir, patch2lesion_path, label_path)
print(len(dataset))
wsi_feat, label = dataset[0]
print(wsi_feat[15].shape)

2000
torch.Size([1, 256])


  patch_feat = torch.tensor(patch_feat)


In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_sequence

lengths_sorted, perm_idx = torch.sort(wsi_feat, descending=True)

# 然后，根据排序后的索引对序列进行重新排序
sequences_sorted = [wsi_feat[idx] for idx in perm_idx]

packed_tensor = pack_sequence(sequences_sorted)
packed_tensor.shape

TypeError: sort() received an invalid combination of arguments - got (list, descending=bool), but expected one of:
 * (Tensor input, *, bool stable, int dim, bool descending, tuple of Tensors out)
 * (Tensor input, int dim, bool descending, *, tuple of Tensors out)
 * (Tensor input, *, bool stable, name dim, bool descending, tuple of Tensors out)
 * (Tensor input, name dim, bool descending, *, tuple of Tensors out)


In [15]:
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_sequence, unpack_sequence

# 假设我们有一些序列，它们的长度列表不是降序排列的
lengths = [3, 2, 5]  # 对应于序列 [3, 2, 5]
sequences = [torch.randn(l, 5) for l in lengths]

# 首先，确保长度列表是降序排列的
lengths_sorted, perm_idx = torch.sort(torch.tensor(lengths), descending=True)

# 然后，根据排序后的索引对序列进行重新排序
sequences_sorted = [sequences[idx] for idx in perm_idx]
print(sequences_sorted)

# 现在，我们可以安全地使用 pack_sequence 了
packed_sequence = pack_sequence(sequences_sorted)

# packed_sequence 是一个 PackedSequence 对象
print(packed_sequence)

# 如果需要，我们可以解包 packed_sequence
unpacked_sequences = unpack_sequence(packed_sequence)

# unpacked_sequences 是一个张量列表，它按照原始的顺序排列
for seq in unpacked_sequences:
    print(seq)


[tensor([[-1.4805, -1.5006,  1.6302,  0.5000, -0.5761],
        [ 0.2129, -1.0223,  0.0980,  0.0045, -2.1920],
        [ 0.8251, -0.7871, -0.4727,  0.1925, -3.2518],
        [-0.5654,  0.7930, -0.6940, -0.6324, -0.0497],
        [-2.0832, -1.5861,  0.2494,  0.6472, -1.7879]]), tensor([[ 0.6918,  1.6860,  0.6797,  0.1683,  1.3757],
        [-1.3800, -0.1871,  0.7400, -2.2203, -1.2459],
        [-1.1267, -0.6937, -1.1342,  2.2594,  1.1274]]), tensor([[ 0.6033, -0.1627,  0.1167,  0.7727,  0.5980],
        [-0.7639,  0.3461,  0.1174, -0.2397,  0.3104]])]
PackedSequence(data=tensor([[-1.4805, -1.5006,  1.6302,  0.5000, -0.5761],
        [ 0.6918,  1.6860,  0.6797,  0.1683,  1.3757],
        [ 0.6033, -0.1627,  0.1167,  0.7727,  0.5980],
        [ 0.2129, -1.0223,  0.0980,  0.0045, -2.1920],
        [-1.3800, -0.1871,  0.7400, -2.2203, -1.2459],
        [-0.7639,  0.3461,  0.1174, -0.2397,  0.3104],
        [ 0.8251, -0.7871, -0.4727,  0.1925, -3.2518],
        [-1.1267, -0.6937, -1.1342,  2