# Preparation 

In [1]:
!pip install drain3

Collecting drain3
  Downloading drain3-0.9.6.tar.gz (22 kB)
Collecting jsonpickle==1.5.1
  Downloading jsonpickle-1.5.1-py2.py3-none-any.whl (37 kB)
Collecting cachetools==4.2.1
  Downloading cachetools-4.2.1-py3-none-any.whl (12 kB)
Building wheels for collected packages: drain3
  Building wheel for drain3 (setup.py) ... [?25l[?25hdone
  Created wheel for drain3: filename=drain3-0.9.6-py3-none-any.whl size=18869 sha256=4fe02d6027bfd01207d5b88269067ef3167e47a15c60e8e9bdcbd312df59704f
  Stored in directory: /root/.cache/pip/wheels/1c/de/75/b3158b4ef2fffbdbb75cfceb706c645d27b19e5d8cb0eec180
Successfully built drain3
Installing collected packages: jsonpickle, cachetools, drain3
  Attempting uninstall: cachetools
    Found existing installation: cachetools 4.2.2
    Uninstalling cachetools-4.2.2:
      Successfully uninstalled cachetools-4.2.2
Successfully installed cachetools-4.2.1 drain3-0.9.6 jsonpickle-1.5.1


In [2]:
import re
import pandas as pd

from drain3 import TemplateMiner
import re
from drain3.template_miner_config import TemplateMinerConfig
from multiprocessing import Pool
from tqdm import tqdm_notebook
from sklearn.metrics import precision_recall_fscore_support

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions defintion

In [3]:
def generate_logformat_regex(logformat):
    """ Function to generate regular expression to split log messages
    """
    headers = []
    splitters = re.split(r'(<[^<>]+>)', logformat)
    regex = ''
    for k in range(len(splitters)):
        if k % 2 == 0:
            splitter = re.sub(' +', '\\\s+', splitters[k])
            regex += splitter
        else:
            header = splitters[k].strip('<').strip('>')
            regex += '(?P<%s>.*?)' % header
            headers.append(header)
    regex = re.compile('^' + regex + '$')
    return headers, regex


# Train Log parsing object : **tm**
#function of tm: we input a log message into **tm**, **tm** return template id of it
def baseline(Path,headers,regex,line_number):
    df = []
    
    #-----
    #parsing all logs, try to get all templates as precisely as possible 
    #-----
    with open(Path,'r') as data:
        for i in tqdm_notebook(range(line_number)):
            # this fixed number is to avoid endless loop (only for code running on colab)
            line = data.readline()
            try:
                match = regex.search(line.strip())
            except:
                continue
            tmp = [re.findall(r'blk_-?\d+',line)[0]]
            for i in range(len(headers)):
                tmp.append(match.group(headers[i]))
            df.append(tmp)

        headers = ['Id']+ headers
        return pd.DataFrame(df,columns=headers)

            

            


# Log Parsing

## load data

In [4]:
# Data preprocessing
# BGL
#!cp ./drive/MyDrive/thunderbird/BGL.tar.gz .
#!tar xf BGL.tar.gz
#!wc -l BGL.log

#HDFS
format_string = '<Date> <Time> <Pid> <Level> <Component>: <Content>'
src = 'HDFS.log'
dst = './drive/MyDrive/hdfs/'
!cp ./drive/MyDrive/hdfs/HDFS_1.tar.gz .
!tar xf HDFS_1.tar.gz
!wc -l HDFS.log

11175629 HDFS.log


## first parsing with regular expression

In [5]:
headers,regex = generate_logformat_regex(format_string)
df = baseline(src,headers,regex,11175629)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/11175629 [00:00<?, ?it/s]

# Visualization for thesis

In [None]:
df.Date = df.Date.astype(str)
df.Time = df.Time.astype(str)
df['DateTime'] = pd.to_datetime(df.Date + df.Time, format='%m%d%y%H%M%S')

anomaly = pd.read_csv('anomaly_label.csv')
anomaly.set_index(anomaly.columns[0],inplace = True)
#df = df.set_index('Id')
df2 = df.merge(anomaly, left_index= True ,right_index= True)

In [None]:
df2['DateTime'] = pd.to_datetime(df2.Date + df2.Time, format='%y%m%d%H%M%S')
df2.sort_values('DateTime',inplace = True)
df4 = df2[df2.Label == 'Normal']

In [None]:
df2 = df2[df2.Label != 'Normal']
df3 = df2.groupby('index')['Content'].agg(list)

In [None]:
df4.reset_index(inplace = True)
df4 = df4.groupby('index')['Content'].agg(list)

In [None]:
df4.loc['blk_-1016453873803095686'] #600

['BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811101024_0008/job.jar. blk_-1016453873803095686',
 'BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.71.97:50010 is added to blk_-1016453873803095686 size 91178',
 'PacketResponder 0 for block blk_-1016453873803095686 terminating',
 'Receiving block blk_-1016453873803095686 src: /10.251.106.10:35842 dest: /10.251.106.10:50010',
 'BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.106.10:50010 is added to blk_-1016453873803095686 size 91178',
 'BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.73.220:50010 is added to blk_-1016453873803095686 size 91178',
 'Received block blk_-1016453873803095686 of size 91178 from /10.250.19.102',
 'Receiving block blk_-1016453873803095686 src: /10.251.71.97:52272 dest: /10.251.71.97:50010',
 'PacketResponder 1 for block blk_-1016453873803095686 terminating',
 'Received block blk_-1016453873803095686 of size 91178 from /10.251.71.97',
 'PacketResponder 2 for

In [None]:
df4[df4.astype('str').str.contains('transfer')]

index
blk_-100589041704503944     [Receiving block blk_-100589041704503944 src: ...
blk_-1016453873803095686    [BLOCK* NameSystem.allocateBlock: /mnt/hadoop/...
blk_-1029209534735263783    [Receiving block blk_-1029209534735263783 src:...
blk_-1060774765824706733    [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_-1094721103807656451    [Receiving block blk_-1094721103807656451 src:...
                                                  ...                        
blk_941820582974059667      [Receiving block blk_941820582974059667 src: /...
blk_949040098671534306      [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_954443842682446851      [Receiving block blk_954443842682446851 src: /...
blk_967210710264965004      [Receiving block blk_967210710264965004 src: /...
blk_984232472855927874      [Receiving block blk_984232472855927874 src: /...
Name: Content, Length: 1779, dtype: object

In [None]:
df3[df3.astype('str').str.contains('exception while serving')]

index
blk_-1001299764911418845    [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_-1009207079038502874    [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_-101482901886772364     [Receiving block blk_-101482901886772364 src: ...
blk_-1018108268208665701    [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_-1022089499698905472    [BLOCK* NameSystem.allocateBlock: /user/root/r...
                                                  ...                        
blk_945842064773478209      [Receiving block blk_945842064773478209 src: /...
blk_956110349654312915      [Receiving block blk_956110349654312915 src: /...
blk_963904426957071019      [Receiving block blk_963904426957071019 src: /...
blk_971772404122301304      [BLOCK* NameSystem.allocateBlock: /user/root/r...
blk_989409441141247289      [Receiving block blk_989409441141247289 src: /...
Name: Content, Length: 3178, dtype: object

In [None]:
df3.loc['blk_945842064773478209']#'blk_-101482901886772364'

['Receiving block blk_945842064773478209 src: /10.250.15.198:49679 dest: /10.250.15.198:50010',
 'BLOCK* NameSystem.allocateBlock: /user/root/rand/_temporary/_task_200811101024_0001_m_000165_0/part-00165. blk_945842064773478209',
 'Receiving block blk_945842064773478209 src: /10.250.15.198:32904 dest: /10.250.15.198:50010',
 'Receiving block blk_945842064773478209 src: /10.250.10.223:53598 dest: /10.250.10.223:50010',
 'Received block blk_945842064773478209 of size 67108864 from /10.250.15.198',
 'PacketResponder 0 for block blk_945842064773478209 terminating',
 'PacketResponder 2 for block blk_945842064773478209 terminating',
 'BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.107.98:50010 is added to blk_945842064773478209 size 67108864',
 'Received block blk_945842064773478209 of size 67108864 from /10.250.15.198',
 'BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.10.223:50010 is added to blk_945842064773478209 size 67108864',
 'Received block blk_945842064773478

In [None]:
df3.loc['blk_-1377816241829109371']

['Receiving block blk_-1377816241829109371 src: /10.251.89.155:52440 dest: /10.251.89.155:50010',
 'BLOCK* NameSystem.allocateBlock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_000093_0/part-00093. blk_-1377816241829109371']

In [None]:
df2.groupby('index')['']

In [None]:
for i in list(df[df['Id']== df['Id'][10086]]['Content']):
    print(i)

Receiving block blk_7125954578896252242 src: /10.251.39.64:52175 dest: /10.251.39.64:50010
Receiving block blk_7125954578896252242 src: /10.251.39.64:58769 dest: /10.251.39.64:50010
BLOCK* NameSystem.allocateBlock: /user/root/rand/_temporary/_task_200811092030_0001_m_000152_0/part-00152. blk_7125954578896252242
Receiving block blk_7125954578896252242 src: /10.251.70.5:45337 dest: /10.251.70.5:50010
PacketResponder 1 for block blk_7125954578896252242 terminating
PacketResponder 2 for block blk_7125954578896252242 terminating
Received block blk_7125954578896252242 of size 67108864 from /10.251.39.64
Received block blk_7125954578896252242 of size 67108864 from /10.251.39.64
PacketResponder 0 for block blk_7125954578896252242 terminating
Received block blk_7125954578896252242 of size 67108864 from /10.251.70.5
BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.39.64:50010 is added to blk_7125954578896252242 size 67108864
BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31

 ## Second parsing

## second parsing with drain3


In [6]:
tc = TemplateMinerConfig()
tc.load('./drive/MyDrive/hdfs/drain3_BGL.ini') # this is config file, use the same setting for both BGL and Thunderbird 
tm=TemplateMiner(config=tc)
df['template_id'] = df['Content'].apply(lambda x: tm.add_log_message(x)['cluster_id'] )
df.to_csv('/content/drive/MyDrive/hdfs/parsing_result',index = False)

total          : took    26.23 s (100.00%),  1,123,520 samples,   23.34 ms / 1000 samples,       42,835.80 hz
drain          : took    19.18 s ( 73.14%),  1,123,520 samples,   17.08 ms / 1000 samples,       58,564.88 hz
tree_search    : took     8.10 s ( 30.90%),  1,123,520 samples,    7.21 ms / 1000 samples,      138,631.78 hz
cluster_exist  : took     4.03 s ( 15.37%),  1,123,481 samples,    3.59 ms / 1000 samples,      278,701.66 hz
mask           : took     1.04 s (  3.98%),  1,123,520 samples,    0.93 ms / 1000 samples,    1,077,558.55 hz
create_cluster : took     0.00 s (  0.00%),         39 samples,   18.54 ms / 1000 samples,       53,932.69 hz
total          : took    52.71 s (100.00%),  2,274,195 samples,   23.18 ms / 1000 samples,       43,146.07 hz
drain          : took    38.51 s ( 73.07%),  2,274,195 samples,   16.93 ms / 1000 samples,       59,051.30 hz
tree_search    : took    16.28 s ( 30.88%),  2,274,195 samples,    7.16 ms / 1000 samples,      139,718.98 hz
cluster_ex

In [7]:
# get all template
templates = [' '.join(tm.drain.id_to_cluster[i+1].log_template_tokens) for i in range(51)]

In [15]:
pip install textdistance

Collecting textdistance
  Downloading textdistance-4.2.1-py3-none-any.whl (28 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.2.1


In [26]:
import textdistance

threshold = 2
too_similar = []
for i in range(len(templates)):
    for j in range(i+1,len(templates)):
        m = textdistance.levenshtein.distance(templates[i].split(' '), templates[j].split(' '))
        if m<=threshold:
            too_similar.append((templates[i],templates[j],m))

In [27]:
too_similar

[('<*> block blk <*> to <*>', '<*> Served block blk <*> to <*>', 1),
 ('<*> block blk <*> to <*>', '<*> writing block blk <*> to mirror <*>', 2),
 ('<*> Starting thread to transfer block blk <*> to <*> <*>',
  '<*> Starting thread to transfer block blk <*> to <*>',
  1),
 ('BLOCK* ask <*> to replicate blk <*> to datanode(s) <*> <*>',
  'BLOCK* ask <*> to replicate blk <*> to datanode(s) <*>',
  1),
 ('<*> Served block blk <*> to <*>',
  '<*> writing block blk <*> to mirror <*>',
  2),
 ('BLOCK* NameSystem.allocateBlock: <*> logs/history/ip-10-250-19-102.ec2.internal <*> job <*> <*> conf.xml. blk <*>',
  'BLOCK* NameSystem.allocateBlock: <*> logs/history/ip-10-250-19-102.ec2.internal <*> job <*> <*> root <*> blk <*>',
  2),
 ('BLOCK* NameSystem.allocateBlock: <*> temporary/ task 200811092030 <*> <*> <*> <*> blk <*>',
  'BLOCK* NameSystem.allocateBlock: <*> temporary/ task 200811101024 <*> <*> <*> <*> blk <*>',
  1),
 ('writeBlock blk <*> received exception <*>',
  'writeBlock blk <*> re

In [None]:
df

Unnamed: 0,Id,Date,Time,Pid,Level,Component,Content,template_id
0,blk_-1608999687919862906,081109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,1
1,blk_-1608999687919862906,081109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,2
2,blk_-1608999687919862906,081109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,1
3,blk_-1608999687919862906,081109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,1
4,blk_-1608999687919862906,081109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,3
...,...,...,...,...,...,...,...,...
11175624,blk_-6171368032583208892,081111,111557,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_-61713680325832...,15
11175625,blk_6195025276114316035,081111,111607,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_619502527611431...,15
11175626,blk_-3339773404714332088,081111,111613,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_-33397734047143...,15
11175627,blk_1037231945509285002,081111,111615,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_103723194550928...,15


In [None]:
#df = pd.read_csv('/content/drive/MyDrive/hdfs/parsing_result')
df.Date = df.Date.astype(str)
df.Time = df.Time.astype(str)
df['DateTime'] = pd.to_datetime(df.Date + df.Time, format='%m%d%y%H%M%S')
#df['Id'] = df['Id'].apply(lambda x: eval(x)[0])

In [None]:
def build_seq(df,session_id):
    tmp = df[[session_id,'template_id','DateTime']]
    tmp = tmp.groupby(session_id).agg({'template_id': lambda x: x.to_list(),'DateTime': 'max'})
    return tmp

In [None]:
hdfs_seqs = build_seq(df,'Id')
hdfs_seqs.reset_index(inplace = True)
hdfs_seqs.sort_values('DateTime',inplace = True)
hdfs_seqs.to_csv('./drive/MyDrive/hdfs/'+'sequences_result.csv',index = False)

In [None]:
import pandas as pd
hdfs_seqs = pd.read_csv('./drive/MyDrive/hdfs/'+'sequences_result.csv')

Unnamed: 0,Id,template_id,DateTime
532722,blk_808821427005349208,"[12, 1]",1900-01-01 01:00:04
257419,blk_-8426563416414717720,"[12, 1, 1, 18]",1900-11-01 00:01:05
450176,blk_5699756358408958768,"[12, 1]",1900-11-01 00:01:05
37167,blk_-2071456919707338634,"[12, 1, 1, 18]",1900-11-01 00:02:02
83413,blk_-3408464434262389388,"[12, 1, 1, 18]",1900-11-01 00:02:05
...,...,...,...
476151,blk_6458501057260729182,"[1, 1, 1, 43, 3, 4, 3, 4, 3, 4, 5, 5, 5, 10, 1...",1900-11-11 23:55:09
473044,blk_6367016609316899824,"[1, 43, 1, 1, 3, 4, 3, 4, 3, 4, 5, 5, 5, 35, 3...",1900-11-11 23:55:09
476076,blk_6456197171151999317,"[43, 1, 1, 1, 3, 4, 3, 4, 3, 4, 5, 5, 5, 10, 1...",1900-11-11 23:55:09
175111,blk_-6058986837873837461,"[1, 1, 1, 43, 3, 4, 3, 4, 3, 4, 5, 5, 5, 35, 3...",1900-11-11 23:55:09


In [None]:
# add anomaly label
anomaly = pd.read_csv('anomaly_label.csv')
anomaly.set_index(anomaly.columns[0],inplace = True)
hdfs_seqs.set_index('Id',inplace = True)
hdfs_seqs = hdfs_seqs.merge(anomaly,left_index=True,right_index=True,how='inner')
hdfs_seqs.sort_values('DateTime',inplace = True)

NameError: ignored

In [None]:
#generate hdfs version:

#train data first 1% data
sample = hdfs_seqs.iloc[:int(0.01*len(hdfs_seqs))]
# train_clean
sample_clean = sample[sample.Label != 'Anomaly']
with open('./drive/MyDrive/hdfs/'+'hdfs_train_clean.txt','w') as data:
    for i in sample_clean.template_id.to_list():
        data.write(' '.join([str(j) for j in i])+'\n')
# train_dirty
sample_dirty = sample[sample.Label == 'Anomaly']
with open('./drive/MyDrive/hdfs/'+'hdfs_train_dirty.txt','w') as data:
    for i in sample_dirty.template_id.to_list():
        data.write(' '.join([str(j) for j in i])+'\n')

In [None]:
test = hdfs_seqs.iloc[int(0.01*len(hdfs_seqs)):]

test_clean = test[test.Label != 'Anomaly']
with open('./drive/MyDrive/hdfs/'+'hdfs_test_clean.txt','w') as data:
    for i in test_clean.template_id.to_list():
        data.write(' '.join([str(j) for j in i])+'\n')
# train_dirty
test_dirty = test[test.Label == 'Anomaly']
with open('./drive/MyDrive/hdfs/'+'hdfs_test_dirty.txt','w') as data:
    for i in test_dirty.template_id.to_list():
        data.write(' '.join([str(j) for j in i])+'\n')

In [None]:
# our hdfs
p = set([])
with open('./drive/MyDrive/hdfs/'+'hdfs_train_clean.txt') as data:
    for i in data:       
        p.add(i)
    print(len(p))

with open('./drive/MyDrive/hdfs/'+'hdfs_train_dirty.txt') as data:
    for i in data:       
        p.add(i)
    print(len(p))

with open('./drive/MyDrive/hdfs/'+'hdfs_test_clean.txt') as data:
    for i in data:
        p.add(i)
    print(len(p))

with open('./drive/MyDrive/hdfs/'+'hdfs_test_dirty.txt') as data:
    for i in data:
        p.add(i)
    print(len(p))

677
751
15533
19653


In [None]:
# deeplog hdfs
p = set([])
with open('/content/drive/MyDrive/DeepLog/data/hdfs_train') as data:
    for i in data:       
        p.add(i)
    print(len(p))

with open('/content/drive/MyDrive/DeepLog/data/hdfs_test_normal') as data:
    for i in data:
        p.add(i)
    print(len(p))

with open('/content/drive/MyDrive/DeepLog/data/hdfs_test_abnormal') as data:
    for i in data:
        p.add(i)
    print(len(p))

840
14261
18375


In [None]:
df

Unnamed: 0,Id,Date,Time,Pid,Level,Component,Content,template_id,DateTime,Label


In [None]:
import pandas as pd
import re

In [None]:
df = bgl_df
#mostly log are ordered by time

In [None]:
df['Time']=pd.to_datetime(df['Time'],format='%Y-%m-%d-%H.%M.%S.%f')

In [None]:
df.sort_values('Time').index

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            4713483, 4713484, 4713485, 4713486, 4713487, 4713488, 4713489,
            4713490, 4713491, 4713492],
           dtype='int64', length=4713493)

4

In [None]:
x=re.search(r'[0-9]{4}(-[0-9]{2}){3}(.\w+){3}',logs)

In [None]:
x.group(0)

'2005-06-03-15.42.50.363779'

In [None]:
new_dataset=df[['log_event','label','t_id']].groupby('t_id').agg(list) 
# mostly log are order by time
# we can think it is seq 

In [None]:
new_dataset

Unnamed: 0_level_0,log_event,label
t_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1117838570,"[1, 1, 1, 1, 1]","[-, -, -, -, -]"
1117838571,"[1, 1, 1, 1, 1, 1]","[-, -, -, -, -, -]"
1117838572,"[1, 1, 1, 1, 1, 1, 1]","[-, -, -, -, -, -, -]"
1117838573,"[1, 1, 1, 1, 1, 1]","[-, -, -, -, -, -]"
1117838574,"[1, 1, 1, 1, 1, 1, 1]","[-, -, -, -, -, -, -]"
...,...,...
1136389303,"[5132, 5133, 5131, 5131, 5131, 5131]","[-, -, -, -, -, -]"
1136389653,"[5132, 5133]","[-, -]"
1136389988,"[5132, 5133]","[-, -]"
1136389989,"[5131, 5131]","[-, -]"


In [None]:
df[df.log_event==900]

Unnamed: 0,log,label,log_event,t_id,severity,log_sub,message
3657673,1130414824 2005.10.27 R42-M1-N3-C:J17-U11 2005...,-,900,1130414824,FATAL,1130414824 2005.10.27 R42-M1-N3-C:J17-U11 2005...,24:01cdf820 25:01cde9e0 26:027d6500 27:003367...


In [None]:
m=tm.drain.id_to_cluster[900].get_template

In [None]:
m.get_template()

'24:01cdf820 25:01cde9e0 26:027d6500 27:00336700'

In [None]:
b=list(df.loc[:5,'log'])
b

['1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.363779 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.527847 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.675872 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.823719 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.982731 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '1117838571 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.51.131467 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n']

In [None]:
k=list(df.loc[4747960:,'log'])
k

['1136390405 2006.01.04 R30-M0-NC-I:J18-U11 2006-01-04-08.00.05.186013 R30-M0-NC-I:J18-U11 RAS KERNEL FATAL idoproxy communication failure: socket closed\n',
 '1136390405 2006.01.04 R31-M0-NC-I:J18-U11 2006-01-04-08.00.05.204230 R31-M0-NC-I:J18-U11 RAS KERNEL FATAL idoproxy communication failure: socket closed\n',
 '1136390405 2006.01.04 R34-M0-NC-I:J18-U11 2006-01-04-08.00.05.233639 R34-M0-NC-I:J18-U11 RAS KERNEL FATAL idoproxy communication failure: socket closed\n']

In [None]:
df['t_id'] = df.log.str.findall('[0-9]{6,100}').str[0]

In [None]:
m=df.groupby(['t_id','label']).count()

In [None]:
m.loc[m.log.argmax()]

t_id         1123030909
label                 -
log                 393
log_event           393
Name: 125077, dtype: object

In [None]:
df[df.t_id=='1123030909']

In [None]:
m.reset_index(inplace=True)

In [None]:
m[['t_id','label']].groupby('t_id').count()['label'].

4

In [None]:
bgl_df.index.min()

Timestamp('2005-06-03 22:42:50')

In [None]:
bgl_df.Label.unique()

array(['-', 'APPREAD', 'KERNDTLB', 'KERNMC', 'KERNRTSP', 'KERNSTOR',
       'APPCHILD', 'APPALLOC', 'KERNSOCK', 'MASNORM', 'LINKDISC',
       'KERNNOETH', 'KERNTERM', 'APPUNAV', 'KERNPOW', 'KERNPROG',
       'LINKPAP', 'APPBUSY', 'APPTORUS', 'MASABNORM', 'KERNCON', 'APPSEV',
       'MONPOW', 'MONNULL', 'KERNMNT', 'KERNMNTF', 'KERNMICRO', 'APPRES',
       'LINKIAP', 'KERNPAN', 'KERNEXT', 'KERNFLOAT', 'KERNBIT',
       'KERNRTSA', 'MMCS', 'KERNREC', 'KERNSERV', 'MONILL', 'APPTO',
       'LINKBLL', 'KERNTLBE', 'APPOUT'], dtype=object)

In [None]:
bgl_df.Timestamp = pd.to_datetime(bgl_df.Timestamp,unit='s')
bgl_df.set_index('Timestamp',inplace=True)

In [None]:
window_size=3 # unit hour
step_size=0.2 # unit hour
first_start = bgl_df.index[0]

In [None]:
pd.Timestamp(first_start.timestamp()+3600*20,unit='s')

Timestamp('2005-06-04 18:42:50')

In [None]:
first_start.time()

datetime.time(22, 42, 50)

In [None]:
bgl_df[first_start : pd.Timestamp(first_start.timestamp()+300000000,unit='s')][['Label','Node','log_event']]\
.groupby('Node').agg({'Label':set,'log_event':list}).reset_index().Label.astype('str').unique()

array(["{'-'}", "{'KERNPOW', 'MASABNORM', 'KERNSERV', 'MASNORM', '-'}",
       "{'KERNSTOR', 'KERNRTSP', '-'}",
       "{'KERNSTOR', 'KERNRTSP', '-', 'KERNTERM'}",
       "{'KERNMC', 'KERNSTOR', 'KERNRTSP', '-'}", "{'KERNSTOR', '-'}",
       "{'KERNSTOR', '-', 'KERNTERM'}",
       "{'APPTO', 'APPRES', 'APPOUT', 'KERNMNTF', 'APPCHILD', 'APPREAD', 'KERNMNT', 'APPSEV', 'APPBUSY', 'APPUNAV', '-'}",
       "{'APPTO', 'APPRES', 'KERNMNTF', 'APPCHILD', 'APPREAD', 'KERNMNT', 'APPSEV', 'APPBUSY', 'APPUNAV', '-'}",
       "{'KERNSTOR', 'KERNRTSP', 'KERNTERM', 'KERNDTLB', '-'}",
       "{'KERNSTOR', 'KERNDTLB', '-', 'KERNTERM'}",
       "{'KERNCON', 'APPTO', 'APPRES', 'KERNMNTF', 'APPCHILD', 'APPREAD', 'KERNMNT', 'APPSEV', 'APPBUSY', 'APPUNAV', 'KERNSOCK', '-'}",
       "{'KERNSTOR', '-', 'APPTORUS'}", "{'KERNMC', 'KERNSTOR', '-'}",
       "{'APPRES', 'APPOUT', 'KERNMNTF', 'APPCHILD', 'APPREAD', 'KERNMNT', 'APPSEV', 'APPBUSY', 'APPUNAV', '-'}",
       "{'APPTO', 'APPRES', 'APPOUT', 'KERNMNTF', 'A

In [None]:
def create_log_event_sequence(df,window_size=1,step_size=0.2):
    #window_size=3 # unit hour
    #step_size=0.2 # unit hour
    return_df = []
    # start time, node, event sequence,label
    first_start = df.index.min()
    end_time = df.index.max()
    #end_time = pd.Timestamp(first_start.timestamp()+1000*3600,unit='s')
    start = first_start
    print(end_time)
    report=0
    while True:
        report+=1
        if report==200:
            print(end_time-start, start)
            report=0
        start = pd.Timestamp(start.timestamp()+step_size*3600,unit='s')
        end = pd.Timestamp(start.timestamp()+window_size*3600,unit='s')
        tmp_df = df[start : end]
        if end_time<=start:
            return pd.concat(return_df)
        return_df.append(tmp_df[['Label','Node','log_event']]\
.groupby('Node').agg({'Label':set,'log_event':list}).reset_index())
        

In [None]:
start

NameError: name 'start' is not defined

In [None]:
seq_df = create_log_event_sequence(bgl_df)

2006-01-04 16:00:05
213 days 01:29:15 2005-06-05 14:30:50
211 days 09:29:15 2005-06-07 06:30:50
209 days 17:29:15 2005-06-08 22:30:50
208 days 01:29:15 2005-06-10 14:30:50
206 days 09:29:15 2005-06-12 06:30:50
204 days 17:29:15 2005-06-13 22:30:50
203 days 01:29:15 2005-06-15 14:30:50
201 days 09:29:15 2005-06-17 06:30:50
199 days 17:29:15 2005-06-18 22:30:50
198 days 01:29:15 2005-06-20 14:30:50
196 days 09:29:15 2005-06-22 06:30:50
194 days 17:29:15 2005-06-23 22:30:50
193 days 01:29:15 2005-06-25 14:30:50
191 days 09:29:15 2005-06-27 06:30:50
189 days 17:29:15 2005-06-28 22:30:50
188 days 01:29:15 2005-06-30 14:30:50
186 days 09:29:15 2005-07-02 06:30:50
184 days 17:29:15 2005-07-03 22:30:50
183 days 01:29:15 2005-07-05 14:30:50
181 days 09:29:15 2005-07-07 06:30:50
179 days 17:29:15 2005-07-08 22:30:50
178 days 01:29:15 2005-07-10 14:30:50
176 days 09:29:15 2005-07-12 06:30:50
174 days 17:29:15 2005-07-13 22:30:50
173 days 01:29:15 2005-07-15 14:30:50
171 days 09:29:15 2005-07-17 0

In [None]:
len({'-'}-{'-','f'})

0

In [None]:
seq_df.Label.astype('str').unique()

In [None]:
seq_df.Label= seq_df.Label.apply(lambda x: len(x-{'-'}))

In [None]:
seq_df.to_csv('full_bgl_seq.csv')