# 统计标签信息

In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Directory holding the label files, and the label spreadsheet inside it.
root_dir = '/data/hjl/data'
label_file = 'wsi_label_info-20240905.xlsx'
label_path = os.path.join(root_dir, label_file)

# Load the label spreadsheet into a DataFrame (one row per spreadsheet row).
df = pd.read_excel(io=label_path)

In [2]:
# Label distribution of the spreadsheet as loaded, before any filtering.
print(df['wsi_label'].value_counts())
print('########################################')

feat_dir = '/data/hjl/data/ori_img_feats/embed6/'

# Check each slide once: is there an entry in feat_dir named after the wsi_id
# with '.pt' removed?  Collect the answers first and filter the frame in a
# single step afterwards; calling df.drop() inside the loop copies the whole
# frame once per missing slide.
feat_exists = [
    os.path.exists(os.path.join(feat_dir, wsi_id.replace('.pt', '')))
    for wsi_id in tqdm(df['wsi_id'], total=len(df))
]
# Positional boolean mask, so the filter does not depend on index labels.
keep_mask = pd.Series(feat_exists, index=df.index, dtype=bool).to_numpy()

# Number of slides without a feature entry.
count = int((~keep_mask).sum())
df = df.loc[keep_mask]

print(count)
print(df['wsi_label'].value_counts())
print('########################################')

# Keep only the first row for every wsi_id.
df = df.drop_duplicates(subset='wsi_id')
print(df['wsi_label'].value_counts())




wsi_label
NILM            30561
ASC-US           7928
LSIL             5472
BV               3980
ASC-H            1905
M                1903
HSIL             1391
T                 614
AGC               538
ASCH               28
E                  18
Actinomyces         8
NILM-atrophy        4
FXJ                 3
炎症                  2
LSIL+BV             2
ASC-h               2
ASC-US+BV           2
H                   2
？                   1
ASC-H+BV            1
HSV                 1
SCC                 1
Name: count, dtype: int64
########################################


54367it [19:21, 46.80it/s]  

904
wsi_label
NILM            30164
ASC-US           7751
LSIL             5371
BV               3933
M                1874
ASC-H            1851
HSIL             1342
T                 602
AGC               529
E                  17
Actinomyces         8
NILM-atrophy        4
炎症                  2
LSIL+BV             2
ASC-h               2
ASC-US+BV           2
ASCH                2
H                   2
？                   1
HSV                 1
ASC-H+BV            1
FXJ                 1
SCC                 1
Name: count, dtype: int64
########################################
wsi_label
NILM            29454
ASC-US           5905
LSIL             4433
BV               3782
M                1803
ASC-H            1493
HSIL             1195
T                 587
AGC               497
E                  17
Actinomyces         8
NILM-atrophy        4
ASC-US+BV           2
炎症                  2
LSIL+BV             1
FXJ                 1
ASC-H+BV            1
ASC-h               1
？      




# 构造子样本标签

In [3]:
import numpy as np
# Fix the global NumPy seed so the random draw below is repeatable.
np.random.seed(42)

root_dir = '/data/hjl/data'
sample_num = 2000  # total number of rows in the generated subset
output_file = f'wsi_label_{sample_num}.csv'
output_path = os.path.join(root_dir, output_file)

# Slides that must never appear in the subset.
discard_wsi_list = ['CX20193918-ASC-US.pt', 'T2303300641.pt']

# Labels eligible for random sampling.
value_list = ['NILM', 'ASC-US', 'LSIL', 'ASC-H', 'HSIL', 'AGC', 'T', 'M', 'BV']
# Label whose rows are all kept, regardless of the random draw.
specific_value = 'NILM-atrophy'

# Remove the discarded slides first so they are excluded from BOTH the
# always-kept rows and the randomly sampled rows.
df = df[~df['wsi_id'].isin(discard_wsi_list)]

specific_samples = df[df['wsi_label'] == specific_value]
specific_samples_count = specific_samples.shape[0]

# Fill the rest of the subset by sampling without replacement.  The size is
# derived from sample_num (not a literal) so it always matches the file name.
remaining_samples_count = sample_num - specific_samples_count
remaining_samples = df[df['wsi_label'].isin(value_list)].sample(
    n=remaining_samples_count, replace=False
)

# Always-kept rows first, then the random draw.
combined_samples = pd.concat([specific_samples, remaining_samples])

# Persist the subset.
combined_samples.to_csv(output_path, index=False)



In [4]:
# Label distribution of the sampled subset.
label_counts = combined_samples['wsi_label'].value_counts()
print(label_counts)

# Same distribution after removing rows that are identical in every column;
# the two tables match when the subset contains no fully duplicated rows.
unique_samples = combined_samples.drop_duplicates()
print('########')
unique_samples['wsi_label'].value_counts()

wsi_label
NILM            1203
ASC-US           209
BV               177
LSIL             171
M                 86
ASC-H             59
HSIL              38
T                 32
AGC               21
NILM-atrophy       4
Name: count, dtype: int64
########


wsi_label
NILM            1203
ASC-US           209
BV               177
LSIL             171
M                 86
ASC-H             59
HSIL              38
T                 32
AGC               21
NILM-atrophy       4
Name: count, dtype: int64

In [5]:
import os
import pandas as pd

root_dir = '/data/hjl/data'
label_file = 'wsi_label_2000.csv'
label_path = os.path.join(root_dir, label_file)

# Read the sampled labels back from the CSV file.
df = pd.read_csv(label_path)

# Summary statistics and per-label counts of the label column.
label_column = df['wsi_label']
print(label_column.describe())
print(label_column.value_counts())


count     2000
unique      10
top       NILM
freq      1203
Name: wsi_label, dtype: object
wsi_label
NILM            1203
ASC-US           209
BV               177
LSIL             171
M                 86
ASC-H             59
HSIL              38
T                 32
AGC               21
NILM-atrophy       4
Name: count, dtype: int64


## 匹配WSI位置

In [6]:
import os, glob
import pandas as pd
from tqdm import tqdm

root_dir = '/data/hjl/data'
sample_num = 2000
# Input and output names are both derived from sample_num so they cannot drift apart.
label_file = f'wsi_label_{sample_num}.csv'
label_path = os.path.join(root_dir, label_file)
output_file = f'wsi_label_{sample_num}_path.csv'
output_path = os.path.join(root_dir, output_file)
wsi_root_dir = '/data2/px_data_lake/TCT_POS_NEG_DATA3'

# Read the sampled labels from the CSV file.
df = pd.read_csv(label_path)

# For every slide, look for an entry named after the wsi_id (without '.pt')
# exactly two directory levels below wsi_root_dir.
wsi_paths = []
for wsi_id in tqdm(df['wsi_id'], total=len(df)):
    wsi_name = wsi_id.replace('.pt', '')
    # Escape the name so characters such as '[', '*' or '?' in an id are
    # matched literally instead of being interpreted as glob wildcards.
    pattern = os.path.join(wsi_root_dir, '*', '*', glob.escape(wsi_name))
    # glob returns matches in arbitrary order; sort so the chosen path is
    # the same on every run.
    matches = sorted(glob.glob(pattern))
    if not matches:
        print(f'No matching WSI found for {wsi_name}')
        # 'flag' marks slides whose directory could not be located.
        wsi_paths.append('flag')
        continue
    if len(matches) > 1:
        print(f'Multiple matching WSIs found for {wsi_name}, using {matches[0]}')
    wsi_paths.append(matches[0])

df['wsi_path'] = wsi_paths
df.to_csv(output_path, index=False)

2000it [00:08, 239.15it/s]


In [7]:
import os, glob
import pandas as pd
from tqdm import tqdm

feat_dir = '/data/hjl/data/ori_img_feats/embed6/'
root_dir = '/data/hjl/data'
label_file = 'wsi_label_2000.csv'
label_path = os.path.join(root_dir, label_file)
df = pd.read_csv(label_path)

# Count the sampled slides that have no entry in feat_dir.
count = 0
for raw_id in tqdm(df['wsi_id'], total=len(df)):
    wsi_id = raw_id.replace('.pt', '')
    # A plain existence test: unlike glob.glob it does not treat characters
    # such as '[' or '*' in the id as wildcards, and it follows symlinks so a
    # dangling link is reported as missing.
    if not os.path.exists(os.path.join(feat_dir, wsi_id)):
        print(f'No matching WSI found for {wsi_id}')
        count += 1
print(count)


2000it [00:00, 20395.10it/s]

0





# 统计图片信息

In [8]:
import os, glob
import pandas as pd
from tqdm import tqdm

root_dir = '/data/hjl/data'
label_file = 'wsi_label_2000_path.csv'
label_path = os.path.join(root_dir, label_file)
output_file = 'wsi_label_2000_imgpath.csv'
output_path = os.path.join(root_dir, output_file)

# Sub-directories of a slide directory that may hold the images, probed in
# this order; the first one that exists wins.
candidate_subdirs = [
    ('TCT', 'OriginalImage'),
    ('DigitalSlice', 'OriginalImage'),
    ('OriginalImage',),
]
# Value stored when no image directory can be determined for a slide.
missing_marker = 'flag'

# Read the labels together with the slide directories from the CSV file.
df = pd.read_csv(label_path)
wsi_paths = df['wsi_path']
wsi_img_paths = []
count = 0  # number of slides without a usable image directory
print(len(wsi_paths))
for wsi_path in tqdm(wsi_paths):
    candidates = [os.path.join(wsi_path, *parts) for parts in candidate_subdirs]
    img_dir = next((path for path in candidates if os.path.exists(path)), None)

    # Fallback: the slide directory itself holds .jpg files.  Only list it
    # when it is a real directory, so a 'flag' placeholder or a stale path in
    # the input is reported as missing instead of raising from os.listdir.
    if img_dir is None and os.path.isdir(wsi_path):
        if any(name.endswith('.jpg') for name in os.listdir(wsi_path)):
            img_dir = wsi_path

    if img_dir is None:
        img_dir = missing_marker
        print(wsi_path)
        count += 1
    wsi_img_paths.append(img_dir)

print(count)
df['wsi_img_path'] = wsi_img_paths
df.to_csv(output_path, index=False)

# df['wsi_path'] = wsi_paths
# df.to_csv(output_path, index=False)

2000


100%|██████████| 2000/2000 [00:05<00:00, 358.20it/s]

0





# 创建数据集软链接

In [9]:
import os
import pandas as pd

output_dir = '/data/hjl/data/TCTGC-2000'
csv_file = '/data/hjl/data/wsi_label_2000_imgpath.csv'
df = pd.read_csv(csv_file)

# os.symlink does not create parent directories.
os.makedirs(output_dir, exist_ok=True)

# Create one symlink per slide: <output_dir>/<wsi name> -> image directory.
for index, row in df.iterrows():
    wsi_name = row['wsi_id'].replace('.pt', '')
    wsi_img_path = row['wsi_img_path']
    link_path = os.path.join(output_dir, wsi_name)

    # 'flag' is the placeholder stored in wsi_img_path when no image directory
    # was found; linking to it would only create a dangling symlink.
    if wsi_img_path == 'flag':
        print(f'Skipping {wsi_name}: no image directory recorded')
        continue

    # Make the cell safe to re-run: os.symlink raises FileExistsError when
    # the link name is already taken.  lexists also sees dangling links.
    if os.path.lexists(link_path):
        already_correct = (
            os.path.islink(link_path) and os.readlink(link_path) == wsi_img_path
        )
        if not already_correct:
            print(f'Skipping {wsi_name}: {link_path} already exists')
        continue

    os.symlink(wsi_img_path, link_path)

