In [1]:
from drain3.template_miner_config import TemplateMinerConfig
from drain3.file_persistence import FilePersistence
from drain3.template_miner import TemplateMiner
import pandas as pd
import re

In [6]:
config = TemplateMinerConfig()
persistence_handler = FilePersistence("drain3_state.json")  # 使用你之前保存状态的文件名
template_miner = TemplateMiner(persistence_handler)

config file not found: drain3.ini


In [7]:
new_log_line = "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.10.176:50010 is added to blk_3917002307947860468 size 67108864"
result = template_miner.match(new_log_line.strip())
print(result.cluster_id)

50


In [8]:
# 读取log_sorted_100.csv
df = pd.read_csv('../output/hdfs/log_sorted_100.csv', quotechar='"')

# 获取第一个blk的值
first_blk_value = df['blk'].iloc[0]

# 提取第一个blk的所有日志
first_blk_logs = df[df['blk'] == first_blk_value]['Content']

logkeys = []

# 使用template_miner.match解析这些日志
for log in first_blk_logs:
    result = template_miner.match(log)
    if result is not None:
        print(f"Log: {log}")
        print(f"Matched Template ID: {result.cluster_id}")
        print(f"Matched Template: {result.get_template()}\n")
        logkeys.append(result.cluster_id)
    else:
        print(f"No template matched for log: {log}\n")

print(logkeys)

Log: Receiving block blk_9223343671363935345 src: /10.251.106.50:53905 dest: /10.251.106.50:50010
Matched Template ID: 46
Matched Template: Receiving block <*> src: <*> dest: <*>

Log: Receiving block blk_9223343671363935345 src: /10.251.203.4:33104 dest: /10.251.203.4:50010
Matched Template ID: 46
Matched Template: Receiving block <*> src: <*> dest: <*>

Log: Receiving block blk_9223343671363935345 src: /10.251.106.50:45367 dest: /10.251.106.50:50010
Matched Template ID: 46
Matched Template: Receiving block <*> src: <*> dest: <*>

Log: BLOCK* NameSystem.allocateBlock: /user/root/sortrand2/_temporary/_task_200811101024_0003_r_000315_0/part-00315. blk_9223343671363935345
Matched Template ID: 47
Matched Template: BLOCK* NameSystem.allocateBlock: <*> <*>

Log: PacketResponder 1 for block blk_9223343671363935345 terminating
Matched Template ID: 48
Matched Template: PacketResponder <*> for block <*> <*>

Log: Received block blk_9223343671363935345 of size 67108864 from /10.251.106.50
Matche

In [9]:
# 打印所有已识别的日志模板
for idx, cluster in enumerate(template_miner.drain.clusters):
    print(f"Template {idx + 1}: {cluster.get_template()}")

Template 1: <*> <*> <*> INFO dfs.DataNode$DataXceiver: Receiving block <*> src: <*> dest: <*>
Template 2: <*> <*> <*> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: <*> <*>
Template 3: <*> <*> <*> INFO dfs.DataNode$PacketResponder: PacketResponder <*> for block <*> <*>
Template 4: <*> <*> <*> INFO dfs.DataNode$PacketResponder: Received block <*> of size <*> from <*>
Template 5: <*> <*> <*> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <*> is added to <*> size <*>
Template 6: <*> <*> <*> INFO dfs.DataNode$DataXceiver: Received block <*> src: <*> dest: <*> of size <*>
Template 7: <*> <*> <*> INFO dfs.DataNode$DataTransfer: <*> block <*> to <*>
Template 8: 081109 203521 19 INFO <*> <*> <*> <*> to <*> <*> <*> <*> <*> 10.251.71.193:50010
Template 9: <*> <*> <*> INFO dfs.DataNode$DataXceiver: <*> Served block <*> to <*>
Template 10: <*> <*> 19 INFO dfs.FSNamesystem: BLOCK* ask <*> to replicate <*> to datanode(s) <*> <*>
Template 11: <*> <*> <*> INFO dfs.

In [10]:
# 读取CSV文件
df = pd.read_csv('../output/hdfs/HDFS.log_structured.csv', dtype={'Date': str}, quotechar='"')

# 提取blk值
df['blk'] = df['Content'].apply(lambda x: re.search(r'blk_(-?\d+)', x).group(1) if re.search(r'blk_(-?\d+)', x) else None)

# 将blk值转换为整数
df['blk'] = pd.to_numeric(df['blk'], errors='coerce')

# 确保'Date'和'Time'列的数据都是字符串类型
df['Date'] = df['Date'].astype(str)
df['Time'] = df['Time'].astype(str)

# 检查'Date'和'Time'列中是否存在非数字字符
invalid_date = df[~df['Date'].str.isnumeric()]
invalid_time = df[~df['Time'].str.isnumeric()]

# 如果存在非数字字符，打印出来
if not invalid_date.empty:
    print(f"Invalid values in 'Date' column: {invalid_date['Date'].unique()}")
if not invalid_time.empty:
    print(f"Invalid values in 'Time' column: {invalid_time['Time'].unique()}")

# 检查'Date'和'Time'列中的数据格式
print(df[['Date', 'Time']].head())

# 将'Date'和'Time'列合并成一个'datetime'列，并转换为datetime类型
df['datetime'] = pd.to_datetime(df['Date'] + df['Time'], format='%y%m%d%H%M%S')

# 按照blk和datetime排序
df_sorted = df.sort_values(by=['blk', 'datetime'], ascending=[False, True])

# 保存排序后的DataFrame到新的CSV文件
df_sorted.to_csv('log_sorted.csv', index=False, quoting=1)  # quoting=1 是为了确保Content列被正确地引用


     Date    Time
0  081109  203518
1  081109  203518
2  081109  203519
3  081109  203519
4  081109  203519


In [4]:
import pandas as pd

# 读取已排序的日志文件
df = pd.read_csv('../output/hdfs/log_sorted.csv', quotechar='"')

# 获取前100个唯一的blk值
top_blk_values = df['blk'].unique()[:100]

# 选择与这些blk值相关联的所有行
df_top_blk = df[df['blk'].isin(top_blk_values)]

df_top_blk = df_top_blk.drop(columns=['EventId'])

# 保存到新的CSV文件
df_top_blk.to_csv('../output/hdfs/log_sorted_100.csv', index=False, quoting=1)

In [22]:
df_top_blk.head(5)

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,blk,datetime
0,3958665,81110,150701,12821,INFO,dfs.DataNode$DataXceiver,Receiving block blk_9223343671363935345 src: /...,9223343671363935345,2008-11-10 15:07:01
1,3958685,81110,150701,12955,INFO,dfs.DataNode$DataXceiver,Receiving block blk_9223343671363935345 src: /...,9223343671363935345,2008-11-10 15:07:01
2,3958720,81110,150701,13063,INFO,dfs.DataNode$DataXceiver,Receiving block blk_9223343671363935345 src: /...,9223343671363935345,2008-11-10 15:07:01
3,3958800,81110,150701,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /user/root/so...,9223343671363935345,2008-11-10 15:07:01
4,3961750,81110,150724,12822,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_92233436713639...,9223343671363935345,2008-11-10 15:07:24


In [19]:
# 创建一个空的DataFrame来存储结果
results_df = pd.DataFrame(columns=['Log', 'ClusterID'])

# 迭代df_top_blk['Content']中的每一行
for index, log_line in df_top_blk['Content'].iteritems():
    # 使用template_miner进行匹配
    result = template_miner.match(log_line.strip())
    
    # 检查匹配结果是否为None，并打印/存储结果
    if result is not None:
        # print(f"Log: {log_line}")
        # print(f"Matched Cluster ID: {result.cluster_id}\n")
        
        # 将结果添加到results_df中
        new_row = pd.DataFrame({
            'Log': [log_line],
            'ClusterID': [result.cluster_id]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    else:
        print(f"No template matched for log: {log_line}\n")
        
results_df.head(5)

Unnamed: 0,Log,ClusterID
0,Receiving block blk_9223343671363935345 src: /...,46
1,Receiving block blk_9223343671363935345 src: /...,46
2,Receiving block blk_9223343671363935345 src: /...,46
3,BLOCK* NameSystem.allocateBlock: /user/root/so...,47
4,PacketResponder 1 for block blk_92233436713639...,48
