In [4]:
import pandas as pd
import json
import os

# --- 配置区 ---
dataset_list = ['20NG', 'AGNews', 'DBPedia', 'TREC', 'X-Topic', 'Yahoo', 'banking', 'stackoverflow', 'clinc', 'hwu', 'mcid', 'news', 'ele'] 

# 用于生成统计的代表性 Fold
F_NUM = 5 
P_NUM = 0

# 要统计的已标注数据比例
LABELED_DATA_RATIO = 0.1

# --- 修正后的主逻辑 ---
pd_ans = {'dataset': [], 'ratio':[], 'split':[], 'value':[]}
print("--- Step 2, Block 1: Calculating all statistics ---")

for dataset_name in dataset_list:
    try:
        # 1. 统计总样本数和总标签数
        total_train = len(pd.read_csv(f'{dataset_name}/origin_data/train.tsv', sep='\t'))
        total_dev = len(pd.read_csv(f'{dataset_name}/origin_data/dev.tsv', sep='\t'))
        total_test = len(pd.read_csv(f'{dataset_name}/origin_data/test.tsv', sep='\t'))
        total_labels = len(pd.read_csv(f'{dataset_name}/label/label.list', header=None))

        pd_ans['dataset'].extend([dataset_name] * 4)
        pd_ans['ratio'].extend([1.0] * 4)
        pd_ans['split'].extend(['train', 'dev', 'test', '\\#label']) # 使用带转义的\#label
        pd_ans['value'].extend([total_train, total_dev, total_test, total_labels])
        
        # 2. 统计不同已知类别比例 (rate) 下的细分数据
        for rate in [0.25, 0.5, 0.75]:
            known_label_path = f'{dataset_name}/label/fold{F_NUM}/part{P_NUM}/label_known_{rate}.list'
            known_labels = pd.read_csv(known_label_path, header=None)[0].tolist()
            
            pd_ans['dataset'].append(dataset_name)
            pd_ans['ratio'].append(rate)
            pd_ans['split'].append('\\#label') # 使用带转义的\#label
            pd_ans['value'].append(len(known_labels))
            
            for split in ['train', 'dev']:
                labeled_data_path = f'{dataset_name}/labeled_data/{LABELED_DATA_RATIO}/{split}.tsv'
                df_labeled = pd.read_csv(labeled_data_path, sep='\t')
                sub_df = df_labeled[
                    df_labeled['label'].isin(known_labels) & (df_labeled['labeled'] == True)
                ]
                pd_ans['dataset'].append(dataset_name)
                pd_ans['ratio'].append(rate)
                pd_ans['split'].append(split)
                pd_ans['value'].append(len(sub_df))

    except FileNotFoundError as e:
        print(f"    [错误] 统计失败，缺少文件: {e.filename}，已跳过 {dataset_name}")
        continue
print("--- Data calculation complete. ---")

--- Step 2, Block 1: Calculating all statistics ---
--- Data calculation complete. ---


In [5]:
pd_ans = pd.DataFrame(pd_ans).sort_values(['dataset', 'ratio', 'split'], ascending=[True, True, False]).drop_duplicates()
pivot_ans = pd_ans.pivot(index='dataset', columns=['ratio', 'split'], values=['value'])
pivot_ans = pivot_ans.applymap(lambda x: "{:,}".format(x))
pivot_ans.to_excel('data_statics.xlsx')
pivot_ans

  pivot_ans = pivot_ans.applymap(lambda x: "{:,}".format(x))


Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value
ratio,0.25,0.25,0.25,0.50,0.50,0.50,0.75,0.75,0.75,1.00,1.00,1.00,1.00
split,train,dev,\#label,train,dev,\#label,train,dev,\#label,train,test,dev,\#label
dataset,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
20NG,183,25,5,351,48,10,518,71,15,7000,2000,1000,20
AGNews,175,25,1,350,50,2,525,75,3,7000,2000,1000,4
DBPedia,252,55,55,480,110,110,647,164,164,7000,2000,1000,219
TREC,324,35,12,435,51,24,468,62,35,4849,490,532,47
X-Topic,684,42,16,702,90,32,702,108,48,7000,628,978,64
Yahoo,140,20,2,350,50,5,560,80,8,7000,2000,1000,10
banking,229,28,19,472,55,38,704,81,58,9003,3080,1000,77
clinc,457,76,38,901,150,75,1345,224,112,18000,2250,2250,150
ele,719,119,12,1498,249,25,2276,379,38,29823,14794,4942,50
hwu,187,26,16,361,51,32,557,79,48,7712,1032,933,64


In [6]:
data_statics = pivot_ans.to_latex().replace(' ', '').replace('0000', '')
replace_map = {
    "Classes": '|Classes|',
    "train-labeled": '|Train|',
    "train": '|Train|',
    "eval": '|Validation|',
    "test": '|Test|',
    "banking": 'BANKING',
    "clinc": 'CLINC',
    "stackoverflow": 'STACK',
    "thucnews": 'CNEWS',
    "atis": 'ATIS',
    "snips": 'SNIPS',
    "ele": 'REVIEWS',
    "news": 'NEWSG',
    "0000": "",
    ".00": "",
    "lrrrrrrr": "l|p{1.2cm}p{1.2cm}p{1.2cm}p{1.2cm}p{1.2cm}p{1.2cm}p{1.2cm}",
    "toprule\n": "toprule\nDataset"
}
for i, v in replace_map.items():
    data_statics = data_statics.replace(i, v)
print(data_statics.replace(' ', ''))

\begin{tabular}{llllllllllllll}
\toprule
Dataset&\multicolumn{13}{r}{value}\\
ratio&\multicolumn{3}{r}{0.25}&\multicolumn{3}{r}{0.50}&\multicolumn{3}{r}{0.75}&\multicolumn{4}{r}{1}\\
split&|Train|&dev&\#label&|Train|&dev&\#label&|Train|&dev&\#label&|Train|&|Test|&dev&\#label\\
dataset&&&&&&&&&&&&&\\
\midrule
20NG&183&25&5&351&48&10&518&71&15&7,000&2,000&1,000&20\\
AGNews&175&25&1&350&50&2&525&75&3&7,000&2,000&1,000&4\\
DBPedia&252&55&55&480&110&110&647&164&164&7,000&2,000&1,000&219\\
TREC&324&35&12&435&51&24&468&62&35&4,849&490&532&47\\
X-Topic&684&42&16&702&90&32&702&108&48&7,000&628&978&64\\
Yahoo&140&20&2&350&50&5&560&80&8&7,000&2,000&1,000&10\\
BANKING&229&28&19&472&55&38&704&81&58&9,003&3,080&1,000&77\\
CLINC&457&76&38&901&150&75&1,345&224&112&18,000&2,250&2,250&150\\
REVIEWS&719&119&12&1,498&249&25&2,276&379&38&29,823&14,794&4,942&50\\
hwu&187&26&16&361&51&32&557&79&48&7,712&1,032&933&64\\
mcid&31&4&4&62&8&8&94&12&12&1,198&331&170&16\\
NEWSG&183&25&5&355&49&10&539&74&15&7,000&2,0