In [48]:
import os
import pandas as pd
import numpy as np
SEED = 42
# Kept so that anything else drawing from NumPy's global RNG behaves as before.
np.random.seed(SEED)

root_dir = 'labels'
files = ['all_label.csv', 'train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(root_dir, file) for file in files]

# WSI diagnosis string -> integer class id.
# Ids 0, 5 and 7 are the per-task negative ids listed in `task_neg_id` below.
label_to_id = {
    'NILM': 0,
    'ASC-US': 1,
    'LSIL': 2,
    'ASC-H': 3,
    'HSIL': 4,
    'AGC': 6,
    'T': 8,
    'M': 9,
    'BV': 10,
}
# Positive class ids grouped by task; task_neg_id[i] is the id written onto the
# NILM rows that get assigned to task i.
task_id = [[1, 2, 3, 4], [6], [8, 9, 10]]
task_neg_id = [0, 5, 7]


def encode_labels(raw, label_map, excluded_label='NILM-atrophy'):
    """Turn the raw label table into the frame used for splitting.

    Rows whose ``wsi_label`` equals ``excluded_label`` are dropped, the ``id``
    column is removed, every ``'.pt'`` substring is stripped from ``wsi_id``
    and a ``wsi_label_id`` column is appended by looking ``wsi_label`` up in
    ``label_map``. The input frame is not modified.

    Raises:
        KeyError: if a remaining ``wsi_label`` has no entry in ``label_map``.
    """
    kept = raw[raw['wsi_label'] != excluded_label]
    unknown = [label for label in kept['wsi_label'].unique() if label not in label_map]
    if unknown:
        # Fail fast with the offending values instead of a bare KeyError.
        raise KeyError(f'wsi_label values missing from the label map: {unknown}')
    return kept.drop('id', axis=1).assign(
        wsi_id=lambda d: d['wsi_id'].str.replace('.pt', '', regex=False),
        wsi_label_id=lambda d: d['wsi_label'].map(label_map),
    )


def assign_negative_ids(frame, tasks, neg_ids, nilm_id=0):
    """Distribute the NILM rows among the tasks in proportion to their positives.

    Each task receives ``int(share * n_nilm)`` NILM rows, where ``share`` is the
    task's fraction of all positive rows; the last task takes the remainder.
    Rows are assigned by position (first block -> first task, and so on), i.e.
    in the order they appear in ``frame``.

    Returns:
        ``(positives, negatives)``: the rows whose id differs from ``nilm_id``
        and an independent copy of the NILM rows with ``wsi_label_id``
        rewritten to the per-task negative id. ``frame`` itself is not mutated.

    Raises:
        ValueError: if ``tasks`` is empty, ``tasks`` and ``neg_ids`` differ in
            length, or no positive row belongs to any task.
    """
    if not tasks or len(tasks) != len(neg_ids):
        raise ValueError('tasks and neg_ids must be non-empty and of equal length')

    is_nilm = frame['wsi_label_id'] == nilm_id
    positives = frame[~is_nilm]
    # Explicit copy: we write into this frame below and must not alias `frame`.
    negatives = frame[is_nilm].copy()

    pos_counts = np.array([positives['wsi_label_id'].isin(task).sum() for task in tasks])
    total_pos = pos_counts.sum()
    if total_pos == 0:
        raise ValueError('no positive samples found for any task')

    n_neg = len(negatives)
    quotas = [int(share * n_neg) for share in pos_counts / total_pos]
    quotas[-1] = n_neg - sum(quotas[:-1])  # remainder goes to the last task

    label_col = negatives.columns.get_loc('wsi_label_id')
    start = 0
    for neg_id, quota in zip(neg_ids, quotas):
        negatives.iloc[start:start + quota, label_col] = neg_id
        start += quota
    return positives, negatives


def split_by_fraction(frame, train_frac=0.8, val_frac=0.1):
    """Cut ``frame`` positionally into (train, val, test).

    Train and val sizes are ``int(len * frac)``; test receives whatever is left.
    The frame is expected to be shuffled already.
    """
    n_train = int(len(frame) * train_frac)
    n_val = int(len(frame) * val_frac)
    return (
        frame.iloc[:n_train],
        frame.iloc[n_train:n_train + n_val],
        frame.iloc[n_train + n_val:],
    )


def absent_classes(reference, subset):
    """Return the sorted ``wsi_label_id`` values present in ``reference`` but not in ``subset``."""
    return sorted(int(v) for v in set(reference['wsi_label_id']) - set(subset['wsi_label_id']))


# True inside a notebook kernel; the guard only keeps the file I/O from running
# when the helpers above are imported (e.g. from a test).
if __name__ == '__main__':
    df = encode_labels(pd.read_csv(file_paths[0]), label_to_id)
    print(df.head())

    # Distribute the rows labelled 0 (NILM) among the tasks in proportion to
    # each task's number of positive samples.
    df, df_nilm = assign_negative_ids(df, task_id, task_neg_id)
    print(df_nilm['wsi_label_id'].value_counts())

    df = pd.concat([df, df_nilm], ignore_index=True)

    # Seed passed explicitly so the shuffle does not depend on hidden global RNG state.
    df_shuffle = df.sample(frac=1, random_state=SEED)
    print(df_shuffle.head())

    df_shuffle = df_shuffle.drop('wsi_label', axis=1)
    df_train, df_val, df_test = split_by_fraction(df_shuffle)
    dfs = [df_train, df_val, df_test]
    # Loop variable deliberately not named `df`, so the full table stays bound to `df`.
    for split_name, split_df in zip(('train', 'val', 'test'), dfs):
        print('#################################')
        print(split_df['wsi_label_id'].value_counts())
        # The split is purely random (not stratified), so a rare class can be
        # missing from a small split; report it instead of letting it pass silently.
        missing = absent_classes(df_shuffle, split_df)
        if missing:
            print(f'WARNING: {split_name} split has no samples for wsi_label_id {missing}')

    df_train.to_csv(file_paths[1], index=False, header=False)
    df_val.to_csv(file_paths[2], index=False, header=False)
    df_test.to_csv(file_paths[3], index=False, header=False)

          wsi_id wsi_label  wsi_label_id
4       C02S2647      NILM             0
5  T-SZ230214031        BV            10
6    T2307040262        BV            10
7    T2309220210        BV            10
8       C02S3084      NILM             0
wsi_label_id
0    723
7    449
5     31
Name: count, dtype: int64
               wsi_id wsi_label  wsi_label_id
889     T-SZ230318013      NILM             0
1674  CX20168796-NILM      NILM             7
414        CX20105164    ASC-US             1
1427         L2012136      NILM             0
849        T220125251      NILM             0
#################################
wsi_label_id
0     575
7     349
1     164
10    148
2     141
9      72
3      50
8      30
4      28
5      24
6      15
Name: count, dtype: int64
#################################
wsi_label_id
0     70
7     60
1     21
2     16
10    13
5      5
4      5
3      4
9      4
8      1
Name: count, dtype: int64
#################################
wsi_label_id
0     78
7     40
1

In [47]:
import os
import pandas as pd
import numpy as np
# NOTE(review): this cell runs the same pipeline as the 'labels' cell of this
# notebook with a different root_dir and label map; consider moving the helpers
# below into a shared module and calling them from both cells.
SEED = 42
# Kept so that anything else drawing from NumPy's global RNG behaves as before.
np.random.seed(SEED)

root_dir = 'tmp-labels'
files = ['all_label.csv', 'train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(root_dir, file) for file in files]

# WSI diagnosis string -> integer class id. ASC-US, LSIL, ASC-H and HSIL all
# share id 1 here. Ids 0, 2 and 4 are the per-task negative ids in `task_neg_id`.
label_to_id = {
    'NILM': 0,
    'ASC-US': 1,
    'LSIL': 1,
    'ASC-H': 1,
    'HSIL': 1,
    'AGC': 3,
    'T': 5,
    'M': 6,
    'BV': 7,
}
# Positive class ids grouped by task; task_neg_id[i] is the id written onto the
# NILM rows that get assigned to task i.
task_id = [[1], [3], [5, 6, 7]]
task_neg_id = [0, 2, 4]


def encode_labels(raw, label_map, excluded_label='NILM-atrophy'):
    """Turn the raw label table into the frame used for splitting.

    Rows whose ``wsi_label`` equals ``excluded_label`` are dropped, the ``id``
    column is removed, every ``'.pt'`` substring is stripped from ``wsi_id``
    and a ``wsi_label_id`` column is appended by looking ``wsi_label`` up in
    ``label_map``. The input frame is not modified.

    Raises:
        KeyError: if a remaining ``wsi_label`` has no entry in ``label_map``.
    """
    kept = raw[raw['wsi_label'] != excluded_label]
    unknown = [label for label in kept['wsi_label'].unique() if label not in label_map]
    if unknown:
        # Fail fast with the offending values instead of a bare KeyError.
        raise KeyError(f'wsi_label values missing from the label map: {unknown}')
    return kept.drop('id', axis=1).assign(
        wsi_id=lambda d: d['wsi_id'].str.replace('.pt', '', regex=False),
        wsi_label_id=lambda d: d['wsi_label'].map(label_map),
    )


def assign_negative_ids(frame, tasks, neg_ids, nilm_id=0):
    """Distribute the NILM rows among the tasks in proportion to their positives.

    Each task receives ``int(share * n_nilm)`` NILM rows, where ``share`` is the
    task's fraction of all positive rows; the last task takes the remainder.
    Rows are assigned by position (first block -> first task, and so on), i.e.
    in the order they appear in ``frame``.

    Returns:
        ``(positives, negatives)``: the rows whose id differs from ``nilm_id``
        and an independent copy of the NILM rows with ``wsi_label_id``
        rewritten to the per-task negative id. ``frame`` itself is not mutated.

    Raises:
        ValueError: if ``tasks`` is empty, ``tasks`` and ``neg_ids`` differ in
            length, or no positive row belongs to any task.
    """
    if not tasks or len(tasks) != len(neg_ids):
        raise ValueError('tasks and neg_ids must be non-empty and of equal length')

    is_nilm = frame['wsi_label_id'] == nilm_id
    positives = frame[~is_nilm]
    # Explicit copy: we write into this frame below and must not alias `frame`.
    negatives = frame[is_nilm].copy()

    pos_counts = np.array([positives['wsi_label_id'].isin(task).sum() for task in tasks])
    total_pos = pos_counts.sum()
    if total_pos == 0:
        raise ValueError('no positive samples found for any task')

    n_neg = len(negatives)
    quotas = [int(share * n_neg) for share in pos_counts / total_pos]
    quotas[-1] = n_neg - sum(quotas[:-1])  # remainder goes to the last task

    label_col = negatives.columns.get_loc('wsi_label_id')
    start = 0
    for neg_id, quota in zip(neg_ids, quotas):
        negatives.iloc[start:start + quota, label_col] = neg_id
        start += quota
    return positives, negatives


def split_by_fraction(frame, train_frac=0.8, val_frac=0.1):
    """Cut ``frame`` positionally into (train, val, test).

    Train and val sizes are ``int(len * frac)``; test receives whatever is left.
    The frame is expected to be shuffled already.
    """
    n_train = int(len(frame) * train_frac)
    n_val = int(len(frame) * val_frac)
    return (
        frame.iloc[:n_train],
        frame.iloc[n_train:n_train + n_val],
        frame.iloc[n_train + n_val:],
    )


def absent_classes(reference, subset):
    """Return the sorted ``wsi_label_id`` values present in ``reference`` but not in ``subset``."""
    return sorted(int(v) for v in set(reference['wsi_label_id']) - set(subset['wsi_label_id']))


# True inside a notebook kernel; the guard only keeps the file I/O from running
# when the helpers above are imported (e.g. from a test).
if __name__ == '__main__':
    df = encode_labels(pd.read_csv(file_paths[0]), label_to_id)
    print(df.head())

    # Distribute the rows labelled 0 (NILM) among the tasks in proportion to
    # each task's number of positive samples.
    df, df_nilm = assign_negative_ids(df, task_id, task_neg_id)
    print(df_nilm['wsi_label_id'].value_counts())

    df = pd.concat([df, df_nilm], ignore_index=True)

    # Seed passed explicitly so the shuffle does not depend on hidden global RNG state.
    df_shuffle = df.sample(frac=1, random_state=SEED)
    print(df_shuffle.head())

    df_shuffle = df_shuffle.drop('wsi_label', axis=1)
    df_train, df_val, df_test = split_by_fraction(df_shuffle)
    dfs = [df_train, df_val, df_test]
    # Loop variable deliberately not named `df`, so the full table stays bound to `df`.
    for split_name, split_df in zip(('train', 'val', 'test'), dfs):
        print('#################################')
        print(split_df['wsi_label_id'].value_counts())
        # The split is purely random (not stratified), so a rare class can be
        # missing from a small split; report it instead of letting it pass silently.
        missing = absent_classes(df_shuffle, split_df)
        if missing:
            print(f'WARNING: {split_name} split has no samples for wsi_label_id {missing}')

    df_train.to_csv(file_paths[1], index=False, header=False)
    df_val.to_csv(file_paths[2], index=False, header=False)
    df_test.to_csv(file_paths[3], index=False, header=False)

          wsi_id wsi_label  wsi_label_id
4       C02S2647      NILM             0
5  T-SZ230214031        BV             7
6    T2307040262        BV             7
7    T2309220210        BV             7
8       C02S3084      NILM             0
wsi_label_id
0    723
4    449
2     31
Name: count, dtype: int64
               wsi_id wsi_label  wsi_label_id
889     T-SZ230318013      NILM             0
1674  CX20168796-NILM      NILM             4
414        CX20105164    ASC-US             1
1427         L2012136      NILM             0
849        T220125251      NILM             0
#################################
wsi_label_id
0    575
1    383
4    349
7    148
6     72
5     30
2     24
3     15
Name: count, dtype: int64
#################################
wsi_label_id
0    70
4    60
1    46
7    13
2     5
6     4
5     1
Name: count, dtype: int64
#################################
wsi_label_id
0    78
1    48
4    40
7    16
6    10
3     6
2     2
5     1
Name: count, dtype: int64
