In [None]:
import os
import re
import pandas as pd
from typing import List, Dict

def parse_contextual_anomaly_files(folder_path: str, prefix: str = "contextual_anomaly_scores") -> pd.DataFrame:
    """
    Parse every contextual-anomaly score file in a folder into one DataFrame.

    A file is selected when its name starts with ``prefix`` and ends with
    ``.txt``. Each selected file is read as UTF-8 text and handed to
    ``parse_single_file_content``; every non-empty result becomes one row.
    Files are visited in sorted name order so the row order is reproducible.

    Parameters:
    -----------
    folder_path : str
        Path of the folder that contains the txt files.
    prefix : str
        File-name prefix to match, "contextual_anomaly_scores" by default.

    Returns:
    --------
    pd.DataFrame
        One row per successfully parsed file, or an empty DataFrame when no
        file produced data (a notice is printed in that case).

    Notes:
    ------
    A file that cannot be opened or decoded is reported on stdout and
    skipped. Errors raised by ``os.listdir`` itself (for example a missing
    folder) are not caught and propagate to the caller.
    """
    data_list = []

    # os.listdir() returns names in arbitrary order; sort for a stable result.
    for filename in sorted(os.listdir(folder_path)):
        if not (filename.startswith(prefix) and filename.endswith('.txt')):
            continue

        file_path = os.path.join(folder_path, filename)

        # Only the read is guarded, so that a parsing problem is not
        # misreported as a read failure.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading file {filename}: {e}")
            continue

        file_data = parse_single_file_content(content, filename)
        if file_data:
            data_list.append(file_data)

    if not data_list:
        print("No valid files found.")
        return pd.DataFrame()

    return pd.DataFrame(data_list)

def parse_single_file_content(content: str, filename: str) -> Dict:
    """
    Parse the text of a single score file into a flat record.

    The content is scanned line by line (each line is whitespace-stripped)
    and the following line shapes are recognised:

    * ``Best Detector: <name>``   -> ``detector``
    * ``PR-AUC: <float>``         -> ``pr_auc``
    * ``Precision@50: <float>``   -> ``precision_at_50``
    * ``[<item>, <item>, ...] <n>`` -> ``context_list`` (items with the
      surrounding quotes removed) and ``context_count`` (the last integer
      found after the closing bracket, if any)

    Parameters:
    -----------
    content : str
        Full text of the file.
    filename : str
        Name of the file; stored in the record and used in warnings.

    Returns:
    --------
    Dict
        Keys ``filename``, ``detector``, ``pr_auc``, ``precision_at_50``,
        ``context_list`` and ``context_count``. Fields that were not found
        or could not be parsed keep their initial value (``None``, or ``[]``
        for ``context_list``). A numeric field that fails to parse prints a
        warning instead of raising.
    """
    file_data = {
        'filename': filename,
        'detector': None,
        'pr_auc': None,
        'precision_at_50': None,
        'context_list': [],
        'context_count': None
    }

    for raw_line in content.strip().split('\n'):
        line = raw_line.strip()

        # Detector name
        if line.startswith('Best Detector:'):
            file_data['detector'] = line[len('Best Detector:'):].strip()

        # PR-AUC
        elif line.startswith('PR-AUC:'):
            pr_auc_str = line[len('PR-AUC:'):].strip()
            try:
                file_data['pr_auc'] = float(pr_auc_str)
            except ValueError:
                print(f"Warning: Could not parse PR-AUC value in {filename}")

        # Precision@50
        elif line.startswith('Precision@50:'):
            precision_str = line[len('Precision@50:'):].strip()
            try:
                file_data['precision_at_50'] = float(precision_str)
            except ValueError:
                print(f"Warning: Could not parse Precision@50 value in {filename}")

        # Context list: a line that opens with a square bracket
        elif line.startswith('['):
            # Split at the LAST closing bracket so that text following the
            # list (the count) never leaks into the final list item.
            close_idx = line.rfind(']')
            if close_idx == -1:
                # No closing bracket: treat the rest of the line as the list
                # and do not guess a count from the item text.
                list_content, trailing = line[1:], ''
            else:
                list_content = line[1:close_idx]
                trailing = line[close_idx + 1:]

            stripped_items = (
                item.strip().strip("'\"") for item in list_content.split(',')
            )
            # Dropping empty pieces makes "[]" parse to an empty list.
            file_data['context_list'] = [item for item in stripped_items if item]

            # The context count is the last integer after the bracket.
            digits = re.findall(r'\d+', trailing)
            if digits:
                file_data['context_count'] = int(digits[-1])

    return file_data


# 使用示例
if __name__ == "__main__":
    # Folder to scan; replace with the directory that holds the score files.
    folder_path = "."

    # Parse every matching file into one row of `df`.
    df = parse_contextual_anomaly_files(folder_path)

    if not df.empty:
        print("解析结果:")
        print(df[['filename', 'detector', 'pr_auc', 'precision_at_50', 'context_count']])

        # Expand the context lists: one row per (file, context) pair, so the
        # header printed below is followed by the data it announces.
        expanded_df = df.explode('context_list')
        print("\n展开后的DataFrame:")
        print(expanded_df.head())

        # Optional: call expanded_df.to_csv(...) here to persist the result.

[]
[]
['29']
['31']
['27']
['27']
['34']
['32']
['28']
['29']
['33']
['33']
[]
['33']
['27']
['28']
['29']
['27']
['28']
['31']
['31']
['30']
['31']
[]
['28']
['31']
['32']
['27']
['31']
['33']
['27']
['32']
['34']
['34']
['34']
['32']
['32']
['31']
['33']
['31']
['28']
['29']
['28']
['32']
['34']
['30']
['32']
['31']
['30']
['30']
['29']
['34']
['31']
['32']
['32']
['31']
['28']
['31']
['27']
['27']
['32']
['29']
['28']
['30']
['31']
['32']
['31']
['33']
['34']
['30']
['33']
['28']
['27']
['34']
['32']
['34']
['32']
['27']
['34']
['32']
['32']
['33']
['30']
['28']
['34']
['28']
['34']
['34']
['27']
['27']
['32']
['32']
['29']
['33']
['28']
['28']
['29']
['29']
['28']
['29']
['32']
['33']
['27']
['28']
解析结果:
                                           filename       detector  pr_auc  \
0       contextual_anomaly_scores_LOF_k50_c0.05.txt  LOF_k50_c0.05  0.2346   
1     contextual_anomaly_scores_LOF_k50_c0.05_1.txt  LOF_k50_c0.05  0.2346   
2    contextual_anomaly_scores_LOF_k50_c0.05_10.

In [None]:
# Rank all runs from highest to lowest PR-AUC, then renumber the rows from 0.
df = df.sort_values('pr_auc', ascending=False)
df = df.reset_index(drop=True)
# Keep the 20 best-scoring runs and render them in the notebook.
df_top20 = df.iloc[:20]
display(df_top20)

Unnamed: 0,filename,detector,pr_auc,precision_at_50,context_list,context_count
0,contextual_anomaly_scores_LOF_k50_c0.05_73.txt,LOF_k50_c0.05,0.2673,0.48,"[age_treatment_middle_no_treatment, special_on...",28.0
1,contextual_anomaly_scores_LOF_k50_c0.05_94.txt,LOF_k50_c0.05,0.2662,0.76,"[age_treatment_elderly_post_treatment, age_sex...",28.0
2,contextual_anomaly_scores_LOF_k50_c0.05_60.txt,LOF_k50_c0.05,0.2582,0.7,"[age_sex_middle_0.0, age_sex_senior_0.0, age_t...",31.0
3,contextual_anomaly_scores_LOF_k50_c0.05_7.txt,LOF_k50_c0.05,0.2576,0.76,"[age_sex_young_0.0, age_treatment_elderly_on_t...",33.0
4,contextual_anomaly_scores_LOF_k50_c0.05_34.txt,LOF_k50_c0.05,0.257,0.66,"[age_sex_middle_1.0, special_only_on_lithium, ...",31.0
5,contextual_anomaly_scores_LOF_k50_c0.05_80.txt,LOF_k50_c0.05,0.2555,0.74,"[age_treatment_young_no_treatment, treatment_o...",32.0
6,contextual_anomaly_scores_LOF_k50_c0.05_86.txt,LOF_k50_c0.05,0.2542,0.58,"[age_treatment_senior_no_treatment, treatment_...",28.0
7,contextual_anomaly_scores_LOF_k50_c0.05_67.txt,LOF_k50_c0.05,0.253,0.7,"[sex_special_1.0_general, age_sex_young_1.0, s...",31.0
8,contextual_anomaly_scores_LOF_k50_c0.05_8.txt,LOF_k50_c0.05,0.2509,0.64,"[special_only_on_lithium, sex_special_1.0_on_l...",34.0
9,contextual_anomaly_scores_LOF_k50_c0.05_20.txt,LOF_k50_c0.05,0.2509,0.64,"[age_treatment_senior_no_treatment, treatment_...",33.0
