In [6]:
import os
import pandas as pd
import numpy as np
# Build the multi-task label files.
#
# Steps: read `all_label.csv`, drop the 'NILM-atrophy' rows, encode each
# `wsi_label` as an integer id, hand the NILM (negative) rows out to the tasks
# in proportion to each task's positive count, shuffle, and write an 80/10/10
# train/val/test split as header-less CSV files (columns: wsi_id, wsi_label_id).
SEED = 2024
np.random.seed(SEED)
root_dir = 'labels'
files = ['all_label.csv', 'train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(root_dir, file) for file in files]
label_to_id = {
    'NILM': 0,
    'ASC-US': 1,
    'LSIL': 2,
    'ASC-H': 3,
    'HSIL': 4,
    'AGC': 6,
    'T': 8,
    'M': 9,
    'BV': 10,
}
# Positive label ids belonging to each task, and the id given to the NILM rows
# assigned to that task (same order as `task_id`).
task_id = [[1, 2, 3, 4], [6], [8, 9, 10]]
task_neg_id = [0, 5, 7]

df = pd.read_csv(file_paths[0])
df = df[df['wsi_label'] != 'NILM-atrophy']

# Fail early, listing every label that has no id, instead of stopping on the
# first one with a bare KeyError.
unknown_labels = df.loc[~df['wsi_label'].isin(list(label_to_id)), 'wsi_label'].unique()
if len(unknown_labels):
    raise KeyError(f'labels missing from label_to_id: {list(unknown_labels)}')

df = df.drop('id', axis=1)
df['wsi_id'] = [wsi_id.replace('.pt', '') for wsi_id in df['wsi_id']]
df['wsi_label_id'] = df['wsi_label'].map(label_to_id)

print(df.head())

# Split the rows labelled 0 (NILM) between the tasks in proportion to the
# number of positive rows each task has.
is_nilm = df['wsi_label_id'] == label_to_id['NILM']
# Explicit copy: the rows are modified below, and writing into a filtered
# slice of `df` triggers pandas' SettingWithCopyWarning.
df_nilm = df[is_nilm].copy()
df_pos = df[~is_nilm]

task_pos_num = np.array(
    [df_pos['wsi_label_id'].isin(task).sum() for task in task_id], dtype=float
)
if task_pos_num.sum() == 0:
    raise ValueError('no positive rows found; cannot compute task proportions')
task_pos_num = task_pos_num / task_pos_num.sum()
task_neg_num = [int(p * len(df_nilm)) for p in task_pos_num]
# The last task absorbs the rounding remainder so every NILM row is assigned.
task_neg_num[-1] = len(df_nilm) - sum(task_neg_num[:-1])

# NOTE(review): NILM rows are assigned to tasks in file order, not randomly;
# if all_label.csv is ordered by source this correlates task with source.
label_col = df_nilm.columns.get_loc('wsi_label_id')
start = 0
for neg_id, neg_count in zip(task_neg_id, task_neg_num):
    df_nilm.iloc[start:start + neg_count, label_col] = neg_id
    start += neg_count
print(df_nilm['wsi_label_id'].value_counts())

df = pd.concat([df_pos, df_nilm], ignore_index=True)

# Passing the seed explicitly makes the shuffle independent of whatever else
# has consumed the global NumPy RNG since it was seeded.
# NOTE(review): expected to give the same permutation as the previous
# seed-then-sample code; confirm against the existing split files.
df_shuffle = df.sample(frac=1, random_state=SEED)
print(df_shuffle.head())
num = len(df_shuffle)
train_num = int(num * 0.8)
val_num = int(num * 0.1)
test_num = num - train_num - val_num

df_shuffle = df_shuffle.drop('wsi_label', axis=1)
df_train = df_shuffle.iloc[:train_num]
df_val = df_shuffle.iloc[train_num:train_num + val_num]
df_test = df_shuffle.iloc[train_num + val_num:]
assert len(df_test) == test_num
assert len(df_train) + len(df_val) + len(df_test) == num

# NOTE(review): the split is purely random, not stratified, so rare classes
# can end up with very few rows in val/test.
dfs = [df_train, df_val, df_test]
for split_df in dfs:
    print('#################################')
    print(split_df['wsi_label_id'].value_counts())

df_train.to_csv(file_paths[1], index=False, header=False)
df_val.to_csv(file_paths[2], index=False, header=False)
df_test.to_csv(file_paths[3], index=False, header=False)

          wsi_id wsi_label  wsi_label_id
4       C02S2647      NILM             0
5  T-SZ230214031        BV            10
6    T2307040262        BV            10
7    T2309220210        BV            10
8       C02S3084      NILM             0
wsi_label_id
0    723
7    449
5     31
Name: count, dtype: int64
               wsi_id wsi_label  wsi_label_id
1446  CX20175598-NILM      NILM             0
1219       T220113260      NILM             0
1910       T230108101      NILM             7
891     L2015731-NILM      NILM             0
1813         L2005369      NILM             7
#################################
wsi_label_id
0     585
7     352
1     163
10    146
2     137
9      72
3      46
4      29
5      27
8      26
6      13
Name: count, dtype: int64
#################################
wsi_label_id
0     67
7     54
1     22
10    15
2     14
9      9
3      7
8      4
6      4
4      2
5      1
Name: count, dtype: int64
#################################
wsi_label_id
0     71
7

# MTL任务改为one task任务

In [18]:
import os
import pandas as pd
import numpy as np
# Convert the multi-task label files into a single 9-class task: the three
# per-task negative ids (0, 5, 7) all become class 0 and the positive ids are
# renumbered contiguously from 1 to 8.
# The seed is kept for parity with the other cells; nothing below is random.
np.random.seed(42)
input_label_dir = 'labels'
output_label_dir = 'onetask_9_labels'
files = ['train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(input_label_dir, file) for file in files]
id_to_id = {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 0,
    6: 5,
    7: 0,
    8: 6,
    9: 7,
    10: 8
}

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_label_dir, exist_ok=True)

for file, file_path in zip(files, file_paths):
    # The files have no header row: column 0 is the slide id, column 1 the label id.
    df = pd.read_csv(file_path, header=None)

    # Series.map turns unmapped values into NaN silently, so validate first
    # and keep raising KeyError for an unexpected label id.
    unmapped = df.loc[~df[1].isin(list(id_to_id)), 1].unique()
    if len(unmapped):
        raise KeyError(f'{file_path}: label ids missing from id_to_id: {list(unmapped)}')

    df[1] = df[1].map(id_to_id)
    df.to_csv(os.path.join(output_label_dir, file), index=False, header=False)
    print('##########################')
    print(df[1].value_counts())

##########################
1
0    964
1    163
8    146
2    137
7     72
3     46
4     29
6     26
5     13
Name: count, dtype: int64
##########################
1
0    122
1     22
8     15
2     14
7      9
3      7
6      4
5      4
4      2
Name: count, dtype: int64
##########################
1
0    117
1     24
2     20
8     16
4      7
3      6
7      5
5      4
6      2
Name: count, dtype: int64


# One task任务减少分类为5类

In [22]:
import os
import pandas as pd
import numpy as np
# Reduce the 9-class single-task label files to 5 classes by keeping only the
# rows whose label id is in `ids_retain`; all other rows are dropped.
# The seed is kept for parity with the other cells; nothing below is random.
np.random.seed(42)
input_label_dir = 'onetask_9_labels'
output_label_dir = 'onetask_5_labels'
files = ['train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(input_label_dir, file) for file in files]
ids_retain = [0, 1, 2, 3, 4]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_label_dir, exist_ok=True)

for file, file_path in zip(files, file_paths):
    # The files have no header row: column 0 is the slide id, column 1 the label id.
    df = pd.read_csv(file_path, header=None)
    df_retain = df[df[1].isin(ids_retain)]
    df_retain.to_csv(os.path.join(output_label_dir, file), index=False, header=False)
    print('##########################')
    print(df_retain[1].value_counts())

##########################
1
0    964
1    163
2    137
3     46
4     29
Name: count, dtype: int64
##########################
1
0    122
1     22
2     14
3      7
4      2
Name: count, dtype: int64
##########################
1
0    117
1     24
2     20
4      7
3      6
Name: count, dtype: int64


# one task 改变为2类

In [2]:
import os
import pandas as pd
import numpy as np
# Collapse the 5-class single-task label files into a binary task:
# class 0 stays 0 and classes 1-4 all become 1.
# The seed is kept for parity with the other cells; nothing below is random.
np.random.seed(42)
input_label_dir = 'onetask_5_labels'
output_label_dir = 'onetask_2_labels'
files = ['train_label.csv', 'val_label.csv', 'test_label.csv']
file_paths = [os.path.join(input_label_dir, file) for file in files]
id_to_id = {
    0: 0,
    1: 1,
    2: 1,
    3: 1,
    4: 1,
}

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_label_dir, exist_ok=True)

for file, file_path in zip(files, file_paths):
    # The files have no header row: column 0 is the slide id, column 1 the label id.
    df = pd.read_csv(file_path, header=None)

    # Series.map turns unmapped values into NaN silently, so validate first
    # and keep raising KeyError for an unexpected label id.
    unmapped = df.loc[~df[1].isin(list(id_to_id)), 1].unique()
    if len(unmapped):
        raise KeyError(f'{file_path}: label ids missing from id_to_id: {list(unmapped)}')

    df[1] = df[1].map(id_to_id)
    df.to_csv(os.path.join(output_label_dir, file), index=False, header=False)
    print('##########################')
    print(df[1].value_counts())

##########################
1
0    964
1    375
Name: count, dtype: int64
##########################
1
0    122
1     45
Name: count, dtype: int64
##########################
1
0    117
1     57
Name: count, dtype: int64
