In [40]:
from collections import defaultdict
from datetime import datetime
import pandas as pd
import re

In [41]:
def parse_log_file(file_name):
    """Parse a rank/thread placement log into a DataFrame of events.

    A line is used only when it matches
    ``<YYYY-MM-DD HH:MM:SS.ffffff>: rank <rank>#<thread> on <core>@<host>: <message>``.
    Lines containing the text ``running`` or ``success`` are skipped, as are
    lines that do not match the pattern.

    Parameters
    ----------
    file_name : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per matched line with the columns ``timestamp``, ``rank``,
        ``thread_id``, ``core``, ``host`` and ``event_type``. ``event_type``
        is ``'allocate'`` when the message contains ``alloc`` and ``'fill'``
        otherwise. The six columns are present even when no line matches, so
        callers can always select them by name.

    Raises
    ------
    ValueError
        If a matched line's ``<rank>#<thread>`` or ``<core>@<host>`` field
        does not split into exactly two parts, or a numeric part is not an
        integer.
    """
    # Declare the schema up front: an empty or fully filtered log must still
    # produce the expected columns instead of a column-less frame.
    columns = ['timestamp', 'rank', 'thread_id', 'core', 'host', 'event_type']
    data = {column: [] for column in columns}

    date_pattern = r'(?P<time>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)'
    placement_pattern = r': rank (?P<id>\S+) on (?P<core>\S+): '
    msg_pattern = r'(?P<msg>.+)'
    log_regex = re.compile(date_pattern + placement_pattern + msg_pattern)

    with open(file_name) as file:
        for line in file:
            line = line.strip()
            # Status lines carry no allocate/fill event.
            if 'running' in line or 'success' in line:
                continue
            matches = log_regex.search(line)
            if matches is None:
                continue
            data['timestamp'].append(datetime.strptime(matches.group('time'), '%Y-%m-%d %H:%M:%S.%f'))
            # The id field is "<rank>#<thread_id>".
            rank, thread_id = matches.group('id').split('#')
            data['rank'].append(int(rank))
            data['thread_id'].append(int(thread_id))
            # The core field is "<core>@<host>".
            core, host = matches.group('core').split('@')
            data['core'].append(int(core))
            data['host'].append(host)
            data['event_type'].append('allocate' if 'alloc' in matches.group('msg') else 'fill')
    return pd.DataFrame(data)

In [51]:
# Parse log.txt into a DataFrame with one row per matched log line.
log_data = parse_log_file('log.txt')

In [52]:
# Display the parsed events to eyeball the extracted columns.
log_data

Unnamed: 0,timestamp,rank,thread_id,core,host,event_type
0,2022-08-29 10:20:02.885681,0,0,0,metatron,allocate
1,2022-08-29 10:20:02.885684,0,1,1,metatron,allocate
2,2022-08-29 10:20:02.885743,0,0,0,metatron,fill
3,2022-08-29 10:20:02.885765,0,1,1,metatron,fill
4,2022-08-29 10:20:02.885780,1,0,2,metatron,allocate
5,2022-08-29 10:20:02.885910,1,0,2,metatron,fill
6,2022-08-29 10:20:02.885781,2,0,4,metatron,allocate
7,2022-08-29 10:20:02.885784,2,1,5,metatron,allocate
8,2022-08-29 10:20:02.885924,2,0,4,metatron,fill
9,2022-08-29 10:20:02.885937,2,1,5,metatron,fill


In [53]:
# Summarize column dtypes, non-null counts and memory usage of the parsed log.
log_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   timestamp   60 non-null     datetime64[ns]
 1   rank        60 non-null     int64         
 2   thread_id   60 non-null     int64         
 3   core        60 non-null     int64         
 4   host        60 non-null     object        
 5   event_type  60 non-null     object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 2.9+ KB


In [56]:
# For every (rank, thread_id, host) combination, list the distinct cores it was seen on.
(
    log_data[['rank', 'thread_id', 'host', 'core']]
    .drop_duplicates()
    .groupby(['rank', 'thread_id', 'host'])
    .agg(lambda cores: cores.unique().tolist())
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,core
rank,thread_id,host,Unnamed: 3_level_1
0,0,metatron,[0]
0,1,metatron,[1]
1,0,metatron,[2]
1,1,metatron,[3]
2,0,metatron,[4]
2,1,metatron,[5]
