In [1]:
from data_process_drain3 import process
import csv
import random
import pandas as pd
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
input_csv = '../dataset/otlp/exported_data_1114-1120.csv'
output_dir = '../dataset/otlp_f1/'
export_data_dir = output_dir + 'export_data/'
label_dir = output_dir + 'label/'
result_dir = '../output/otlp_parser_config/model/result_v5/'

In [3]:
def openfile(input_filename):
    with open(input_filename, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
    return reader, rows

def tracegroup(rows):
    trace_groups = {}
    for row in rows:
        trace_id = row['TraceId']
        if trace_id not in trace_groups:
            trace_groups[trace_id] = []
        trace_groups[trace_id].append(row)
    return trace_groups

def outputfile(reader, trace_groups, output_filename):
    with open(output_filename, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
        writer.writeheader()
        for trace_group in trace_groups.values():
            for row in trace_group:
                writer.writerow(row)
                
def output_label(trace_groups, modified_trace_ids, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['TraceId', 'is_anomaly'])
        writer.writeheader()
        for trace_id in trace_groups.keys():
            is_modified = 'True' if trace_id in modified_trace_ids else 'False'
            writer.writerow({'TraceId': trace_id, 'is_anomaly': is_modified})

In [5]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace，刪除Timestamp最晚的3個資料
for trace_id, traces in trace_groups.items():
    if trace_id in selected_trace_ids:
        traces.sort(key=lambda x: x['Timestamp'], reverse=True)  # 根據Timestamp降序排列
        del traces[:3]  # 刪除最晚的3個資料

output_csv = export_data_dir + 'exported_data_remove3.csv'
label_csv = label_dir + 'label_remove3.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [9]:
def process_csv(input_filename, output_filename):
    
    reader, rows = openfile(input_filename)
    trace_groups = tracegroup(rows)

    # 處理每個trace，刪除Timestamp最晚的一半資料
    for trace_id, traces in trace_groups.items():
        traces.sort(key=lambda x: x['Timestamp'], reverse=True)  # 根據Timestamp降序排列
        num_to_remove = len(traces) // 2  # 決定刪除一半的資料
        del traces[:num_to_remove]  # 刪除最晚的一半資料

    outputfile(reader, trace_groups, output_filename)

output_csv = output_dir + 'exported_data_removed.csv'
process_csv(input_csv, output_csv)

In [21]:
def process_csv(input_filename, output_filename):
    reader, rows = openfile(input_filename)
    trace_groups = tracegroup(rows)

    # 處理每個trace
    for trace_id, grouped_rows in trace_groups.items():
        # 找出所有Content不同的數據
        unique_contents = list({row['Content'] for row in grouped_rows})
        if len(unique_contents) > 1:
            # 隨機選擇兩筆Content不同的數據
            content_to_swap = random.sample(unique_contents, 2)
            rows_to_swap = [row for row in grouped_rows if row['Content'] in content_to_swap]

            # 如果找到兩筆資料，則交換它們的Content
            if len(rows_to_swap) == 2:
                rows_to_swap[0]['Content'], rows_to_swap[1]['Content'] = rows_to_swap[1]['Content'], rows_to_swap[0]['Content']

    outputfile(reader, trace_groups, output_filename)

output_csv = output_dir + 'exported_data_switch.csv'
process_csv(input_csv, output_csv)


In [21]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 根據Timestamp降序排列找到最後一筆數據
        grouped_rows.sort(key=lambda x: x['Timestamp'], reverse=True)
        last_row = grouped_rows[0]
        # 複製最後一筆數據一百次
        for _ in range(100):
            grouped_rows.append(last_row.copy())  # 使用copy以避免引用同一個字典對象
        
output_csv = export_data_dir + 'exported_data_tail100.csv'
label_csv = label_dir + 'label_tail100.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [7]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 篩選出Content開頭不是"Select..."的數據
        non_select_rows = [row for row in grouped_rows if not row['Content'].startswith("Select")]
        if non_select_rows:
            # 從符合條件的數據中隨機選擇一筆數據
            selected_row = random.choice(non_select_rows)
            # 複製選定的數據一百次
            for _ in range(100):
                grouped_rows.append(selected_row.copy())  # 使用copy以避免引用同一個字典對象

output_csv = export_data_dir + 'exported_data_random100.csv'
label_csv = label_dir + 'label_random100.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [16]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 根據Timestamp降序排列
        grouped_rows.sort(key=lambda x: x['Timestamp'], reverse=True)
        
        # 找出不以"Select..."開頭的連續兩筆數據
        pair_indices = [(i, i+1) for i in range(len(grouped_rows)-1) 
                        if not (grouped_rows[i]['Content'].startswith("Select") and 
                                grouped_rows[i+1]['Content'].startswith("Select"))]
        
        if pair_indices:
            # 隨機選擇一對連續的數據
            selected_pair_index = random.choice(pair_indices)
            selected_rows = grouped_rows[selected_pair_index[0]:selected_pair_index[1]+1]
            
            # 複製選定的數據一百次
            for _ in range(100):
                grouped_rows.extend([row.copy() for row in selected_rows])

output_csv = export_data_dir + 'exported_data_pair100.csv'
label_csv = label_dir + 'label_pair100.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [14]:
def process_csv(input_filename, output_filename):
    reader, rows = openfile(input_filename)
    trace_groups = tracegroup(rows)

    # 處理每個trace，隨機打亂順序
    for trace_id, grouped_rows in trace_groups.items():
        random.shuffle(grouped_rows)
                
    outputfile(reader, trace_groups, output_filename)

output_csv = output_dir + 'exported_data_shuffle.csv'
process_csv(input_csv, output_csv)

In [15]:
def process_csv(input_filename, output_filename):
    reader, rows = openfile(input_filename)
    trace_groups = tracegroup(rows)

    # 處理每個trace，刪除所有"Content"開頭為"Select"的數據
    for trace_id, grouped_rows in trace_groups.items():
        # 篩選出"Content"開頭不是"Select"的數據
        trace_groups[trace_id] = [row for row in grouped_rows if not row['Content'].startswith("Select")]
                    
    outputfile(reader, trace_groups, output_filename)

output_csv = output_dir + 'exported_data_deleteDatabase.csv'
process_csv(input_csv, output_csv)

In [16]:
def process_csv(input_filename, output_filename):
    reader, rows = openfile(input_filename)
    trace_groups = tracegroup(rows)

    # 處理每個trace，刪除所有"Content"開頭為"Select"的數據
    for trace_id, rows in trace_groups.items():
        # 計算要插入的 "Error" 數據的數量，為當前 trace 數量的 30%
        insert_count = int(len(rows) * 0.4)

        # 對每個要插入的數據
        for _ in range(insert_count):
            # 隨機選擇一筆現有的數據來複製其值（除了 Content）
            template_row = random.choice(rows)
            new_row = template_row.copy()
            
            # 將 "Content" 設置為 "Error"
            new_row['Content'] = "Error"
            
            # 將這筆新數據插入到 rows 列表中的隨機位置
            insert_position = random.randint(0, len(rows))
            rows.insert(insert_position, new_row)

    outputfile(reader, trace_groups, output_filename)

output_csv = output_dir + 'exported_data_insert.csv'
process_csv(input_csv, output_csv)

In [6]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 篩選出開頭不為"Select"的數據
        non_select_rows = [row for row in grouped_rows if not row['Content'].startswith("Select")]
        if non_select_rows:
            # 隨機選擇一筆數據
            selected_row = random.choice(non_select_rows)
            selected_content = selected_row['Content']
            # 將選定的Content替換該組中的其他所有數據的Content
            for row in grouped_rows:
                row['Content'] = selected_content

output_csv = export_data_dir + 'exported_data_replace.csv'
label_csv = label_dir + 'label_replace.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [14]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 篩選出Content不同的數據
        unique_contents = list({row['Content'] for row in grouped_rows})
        
        if len(unique_contents) >= 2:
            # 隨機選擇兩個不同的Content
            selected_contents = random.sample(unique_contents, 2)
            
            # 分別找到這兩個Content對應的數據
            selected_rows = [next(row for row in grouped_rows if row['Content'] == content) for content in selected_contents]
            
            # 清空原分組數據，並加入新的數據
            trace_groups[trace_id] = []
            for _ in range(50):
                trace_groups[trace_id].append(selected_rows[0].copy())
            for _ in range(50):
                trace_groups[trace_id].append(selected_rows[1].copy())

output_csv = export_data_dir + 'exported_data_5050.csv'
label_csv = label_dir + 'label_5050.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [11]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 篩選出Content不同的數據
        unique_contents = list({row['Content'] for row in grouped_rows})
        
        if len(unique_contents) >= 3:
            # 隨機選擇兩個不同的Content
            selected_contents = random.sample(unique_contents, 3)
            
            # 分別找到這兩個Content對應的數據
            selected_rows = [next(row for row in grouped_rows if row['Content'] == content) for content in selected_contents]
            
            # 清空原分組數據，並加入新的數據
            trace_groups[trace_id] = []
            for _ in range(30):
                trace_groups[trace_id].append(selected_rows[0].copy())
            for _ in range(30):
                trace_groups[trace_id].append(selected_rows[1].copy())
            for _ in range(30):
                trace_groups[trace_id].append(selected_rows[2].copy())

output_csv = export_data_dir + 'exported_data_303030.csv'
label_csv = label_dir + 'label_303030.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [18]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        repeated_trace = grouped_rows * 10  # 將分組中的數據循環10次
        trace_groups[trace_id] = repeated_trace

output_csv = export_data_dir + 'exported_data_loop10.csv'
label_csv = label_dir + 'label_loop10.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [19]:
reader, rows = openfile(input_csv)
trace_groups = tracegroup(rows)

# 隨機選擇5%的trace進行處理
selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))

# 處理每個trace
for trace_id, grouped_rows in trace_groups.items():
    if trace_id in selected_trace_ids:
        # 創建一個新的列表來存儲複製後的數據
        duplicated_rows = []

        for row in grouped_rows:
            # 對每筆數據進行10次複製
            duplicated_rows.extend([row.copy() for _ in range(10)])

        # 更新該TraceId分組的數據
        trace_groups[trace_id] = duplicated_rows
        
output_csv = export_data_dir + 'exported_data_multi10.csv'
label_csv = label_dir + 'label_multi10.csv'

outputfile(reader, trace_groups, output_csv)

# 輸出被修改過的trace記錄
output_label(trace_groups, selected_trace_ids, label_csv)

since Python 3.9 and will be removed in a subsequent version.
  selected_trace_ids = random.sample(trace_groups.keys(), k=int(len(trace_groups) * 0.05))


In [5]:
import pandas as pd
import os

directory_path = result_dir

for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        
        df = pd.read_csv(file_path)
        
        anomaly_count = df[df['is_anomaly'] == True].shape[0]
        
        print(f"File: {filename} has {anomaly_count} anomalies.")

File: result_df_tail100.csv has 39 anomalies.
File: result_df_5050.csv has 68 anomalies.
File: result_df_replace.csv has 68 anomalies.
File: result_df_random100.csv has 67 anomalies.
File: result_df_303030.csv has 67 anomalies.
File: result_df_1114-1120.csv has 8 anomalies.
File: result_df_loop10.csv has 45 anomalies.
File: result_df_multi10.csv has 51 anomalies.
File: result_df_pair100.csv has 67 anomalies.


In [42]:
# 讀取異常標記文件
anomaly_df = pd.read_csv('../output/otlp_parser_config/model/result_v4/result_df_1114-1120.csv')

# 獲取所有被標記為異常的TraceId
anomaly_trace_ids = set(anomaly_df[anomaly_df['is_anomaly'] == True]['TraceId'])

# 讀取原始trace數據
exported_data_df = pd.read_csv('../dataset/otlp/exported_data_1114-1120.csv')

# 移除被標記為異常的trace
exported_data_df = exported_data_df[~exported_data_df['TraceId'].isin(anomaly_trace_ids)]

# 保存處理後的數據
exported_data_df.to_csv('../dataset/otlp/exported_data_cleaned.csv', index=False)

In [7]:
def calculate_metrics(label_file, result_file):
    # 讀取真實標籤和預測結果
    label_df = pd.read_csv(label_file)
    result_df = pd.read_csv(result_file)

    # 確保is_anomaly欄位是布爾型
    label_df['is_anomaly'] = label_df['is_anomaly'].astype(bool)
    result_df['is_anomaly'] = result_df['is_anomaly'].astype(bool)

    # 合併數據
    merged_df = pd.merge(label_df, result_df, on='TraceId', suffixes=('_true', '_pred'))

    # 獲取真實標籤和預測結果
    y_true = merged_df['is_anomaly_true']
    y_pred = merged_df['is_anomaly_pred']

    # 計算recall, precision, f1-score
    recall = recall_score(y_true, y_pred, pos_label=True)
    precision = precision_score(y_true, y_pred, pos_label=True)
    f1 = f1_score(y_true, y_pred, pos_label=True)

    return recall, precision, f1

# 獲取所有label和result文件
label_files = [f for f in os.listdir(label_dir) if f.startswith('label_')]
result_files = [f for f in os.listdir(result_dir) if f.startswith('result_df_')]


label_files.sort()
result_files.sort()


for label_file, result_file in zip(label_files, result_files):
    recall, precision, f1 = calculate_metrics(label_dir + label_file, result_dir + result_file)
    print(f"File: {label_file} {result_file} | Recall: {recall}, Precision: {precision}, F1-Score: {f1}")

File: label_303030.csv result_df_303030.csv | Recall: 1.0, Precision: 0.8955223880597015, F1-Score: 0.9448818897637796
File: label_5050.csv result_df_5050.csv | Recall: 0.9833333333333333, Precision: 0.8676470588235294, F1-Score: 0.9218749999999999
File: label_loop10.csv result_df_loop10.csv | Recall: 0.6166666666666667, Precision: 0.8222222222222222, F1-Score: 0.7047619047619048
File: label_multi10.csv result_df_multi10.csv | Recall: 0.7333333333333333, Precision: 0.8627450980392157, F1-Score: 0.7927927927927927
File: label_pair100.csv result_df_pair100.csv | Recall: 1.0, Precision: 0.8955223880597015, F1-Score: 0.9448818897637796
File: label_random100.csv result_df_random100.csv | Recall: 1.0, Precision: 0.8955223880597015, F1-Score: 0.9448818897637796
File: label_replace.csv result_df_replace.csv | Recall: 1.0, Precision: 0.8823529411764706, F1-Score: 0.9375
File: label_tail100.csv result_df_tail100.csv | Recall: 0.5333333333333333, Precision: 0.8205128205128205, F1-Score: 0.6464646

In [12]:
df = pd.read_csv("../dataset/otlp_1209/export_data/exported_data_1209.csv")

if 'Content' in df.columns:
    # df['Content'] = df['Content'].str.replace('[^a-zA-Z0-9]', ' ', regex=True).str.lower()
    df['Content'] = df['Content'].str.lstrip('/')

df.to_csv("../dataset/otlp_1209/export_data/exported_data_1209_noslash.csv", index=False)