In [1]:
import queue
from collections import defaultdict

import nltk
%matplotlib inline

from ipynb.fs.full.log_entries import get_all_log_entries

In [2]:
def full_group_by(l, key=lambda x: x):
    """Group the items of ``l`` by the value ``key`` returns for each item.

    Unlike ``itertools.groupby`` the input does not have to be sorted:
    every item sharing a key ends up in the same bucket, wherever it
    appears in ``l``.

    Args:
        l: Iterable of items to group.
        key: Callable mapping an item to a hashable group key. Defaults to
            the identity function.

    Returns:
        A ``dict.items()`` view of ``(group_key, [items...])`` pairs. Keys
        appear in first-seen order and the items within a bucket keep their
        input order.
    """
    buckets = defaultdict(list)
    for element in l:
        bucket_key = key(element)
        buckets[bucket_key].append(element)
    return buckets.items()

In [None]:
def is_complete(token_position_cardinalities, group_uniqueness_threshold):
    """Decide whether a group is uniform enough to be treated as complete.

    A token position counts as "complete" when its cardinality is exactly 1,
    i.e. every message in the group has the same token there. The group is
    complete when the fraction of such positions is strictly greater than
    ``group_uniqueness_threshold``.

    Args:
        token_position_cardinalities: Sequence with one integer per token
            position: the number of distinct tokens seen at that position.
        group_uniqueness_threshold: Fraction that the share of constant
            positions must strictly exceed.

    Returns:
        ``True`` if the group is complete, ``False`` otherwise. A group with
        no token positions at all is reported as complete.
    """
    group_len = len(token_position_cardinalities)
    if group_len == 0:
        # Messages with zero tokens are all identical, so there is nothing
        # left to split on. Returning early also avoids dividing by zero.
        return True
    complete_token_positions = sum(
        1 for cardinality in token_position_cardinalities if cardinality == 1
    )
    return complete_token_positions / group_len > group_uniqueness_threshold
    
def count_unique_tokens_at_each_token_position(group):
    """Count the distinct tokens found at every token position of a group.

    The number of positions is taken from the first message of the group;
    every message is then indexed at each of those positions.

    Args:
        group: Non-empty sequence of tokenized messages (sequences of
            hashable tokens).

    Returns:
        A list with one integer per token position of ``group[0]``: the
        number of distinct tokens occurring at that position.

    Raises:
        IndexError: If ``group`` is empty, or if a message is shorter than
            the first message.
    """
    position_count = len(group[0])
    return [
        len({message[position] for message in group})
        for position in range(position_count)
    ]

def convert_to_msg_template(group, token_position_cardinalities):
    """Collapse a group of tokenized messages into a single message template.

    The first message of the group is used as the representative. Positions
    whose cardinality is 1 keep the representative's token; every other
    position is replaced by the wildcard ``'*'``.

    Args:
        group: Non-empty sequence of tokenized messages.
        token_position_cardinalities: Number of distinct tokens at each
            position, indexable by every position of ``group[0]``.

    Returns:
        A ``(msg_template, group_size)`` tuple: the template as a list of
        tokens, and the number of messages in the group.
    """
    group_representative = group[0]
    msg_template = [
        token if token_position_cardinalities[position] == 1 else '*'
        for position, token in enumerate(group_representative)
    ]
    # Return the template that was just built; handing back the raw group
    # would leave the wildcard substitution above without any effect.
    return msg_template, len(group)

def determine_log_msg_templates(tokenized_msg_groups_by_length, 
                                group_uniqueness_threshold,
                                absolute_threshold,
                                relative_threshold):
    """Refine groups of tokenized messages into message templates.

    Groups are processed from a FIFO work queue. A group that ``is_complete``
    accepts is converted directly. Otherwise the token position with the
    lowest cardinality is chosen: if that cardinality exceeds
    ``absolute_threshold`` and its ratio to the group size exceeds
    ``relative_threshold``, the group is converted as it is; if not, the
    group is split by the token at that position and each subgroup is either
    converted, re-queued, or recorded as incomplete.

    Args:
        tokenized_msg_groups_by_length: Iterable of groups, each a sequence
            of tokenized messages.
        group_uniqueness_threshold: Forwarded to ``is_complete``.
        absolute_threshold: Value the lowest cardinality must strictly
            exceed for an unsplit group to be accepted.
        relative_threshold: Value the lowest cardinality divided by the
            group size must strictly exceed for an unsplit group to be
            accepted.

    Returns:
        A ``(complete_groups, incomplete_groups)`` tuple of lists holding
        the results of ``convert_to_msg_template``.
    """
    pending = queue.Queue()
    for initial_group in tokenized_msg_groups_by_length:
        pending.put(initial_group)

    complete_groups, incomplete_groups = [], []

    while not pending.empty():
        candidate = pending.get()
        cardinalities = count_unique_tokens_at_each_token_position(candidate)

        if is_complete(cardinalities, group_uniqueness_threshold):
            complete_groups.append(convert_to_msg_template(candidate, cardinalities))
            continue

        # NOTE(review): the minimum may be 1 (a constant position), in which
        # case the split below yields a single subgroup equal to the whole
        # group - confirm that this is the intended choice of position.
        smallest_cardinality = min(cardinalities)
        split_position = cardinalities.index(smallest_cardinality)
        relative_cardinality = smallest_cardinality / len(candidate)

        if smallest_cardinality > absolute_threshold and relative_cardinality > relative_threshold:
            # Even the most uniform position varies too much to split on.
            complete_groups.append(convert_to_msg_template(candidate, cardinalities))
            continue

        partitions = full_group_by(candidate, lambda tokens: tokens[split_position])
        for _, members in partitions:
            member_cardinalities = count_unique_tokens_at_each_token_position(members)
            if is_complete(member_cardinalities, group_uniqueness_threshold):
                complete_groups.append(convert_to_msg_template(members, member_cardinalities))
            elif len(partitions) > 1:
                # The split made progress, so the subgroup gets another pass.
                pending.put(members)
            else:
                # A single partition means the split changed nothing;
                # re-queueing it would never terminate.
                incomplete_groups.append(convert_to_msg_template(members, member_cardinalities))

    return complete_groups, incomplete_groups

def determine_log_msg_templates_from_logs_starting_with(filename_prefix,
                                group_uniqueness_threshold = 0.6,
                                absolute_threshold = 10,
                                relative_threshold = 0.1):
    """Load log entries, tokenize their messages and derive templates.

    The entries returned by ``get_all_log_entries(filename_prefix)`` have
    their ``'msg'`` value tokenized with ``nltk.word_tokenize``. The
    tokenized messages are grouped by token count, and the groups are handed
    to ``determine_log_msg_templates``.

    Args:
        filename_prefix: Passed unchanged to ``get_all_log_entries``.
        group_uniqueness_threshold: Forwarded to
            ``determine_log_msg_templates``.
        absolute_threshold: Forwarded to ``determine_log_msg_templates``.
        relative_threshold: Forwarded to ``determine_log_msg_templates``.

    Returns:
        Whatever ``determine_log_msg_templates`` returns for the
        length-grouped messages.
    """
    log_entries = get_all_log_entries(filename_prefix)
    tokenized_msgs = [nltk.word_tokenize(log_entry['msg']) for log_entry in log_entries]
    # Only the grouped messages are needed, not the length used as key.
    same_length_groups = [
        msgs for _length, msgs in full_group_by(tokenized_msgs, len)
    ]
    return determine_log_msg_templates(
        same_length_groups,
        group_uniqueness_threshold=group_uniqueness_threshold,
        absolute_threshold=absolute_threshold,
        relative_threshold=relative_threshold,
    )