In [None]:
import math
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
import nltk

In [None]:
def number_of_significant_words(counter, divisor=1000):
    """Count the entries of ``counter`` that occur unusually often.

    An entry is considered significant when its count is strictly greater
    than ``sum(counter.values()) / divisor``.

    Parameters
    ----------
    counter : collections.Counter or mapping
        Maps each item to its number of occurrences.
    divisor : int or float, optional
        Denominator of the significance threshold. The default of 1000
        means "more than 0.1% of all occurrences", which is the threshold
        this function has always used.
        NOTE(review): the threshold used to live in a local named
        ``one_percent`` although it is 0.1%; pass ``divisor=100`` if a true
        one-percent cut-off was intended.

    Returns
    -------
    int
        Number of entries whose count exceeds the threshold.

    Raises
    ------
    ZeroDivisionError
        If ``divisor`` is zero.
    """
    threshold = sum(counter.values()) / divisor
    # Only the counts matter here, so walk the values directly instead of
    # building a set of keys and looking each one up again.
    return sum(1 for occurrences in counter.values() if occurrences > threshold)

In [None]:
def is_word(word):
    """Return True when every character of ``word`` is alphabetic.

    An empty string has no offending character and therefore also
    yields True.
    """
    return all(char.isalpha() for char in word)

def is_start_of_log_msg(word):
    """Tell whether ``word`` is the ``<s>`` token that opens a log message."""
    start_marker = '<s>'
    return word == start_marker

def is_end_of_log_msg(word):
    """Tell whether ``word`` is the ``</s>`` token that closes a log message."""
    end_marker = '</s>'
    return word == end_marker

def get_df_counter(file_prefix):
    """Compute per-word document frequencies over a file of log messages.

    Reads ``'../../' + file_prefix + '-all-words.txt'`` one token per line
    (surrounding whitespace stripped). A ``<s>`` token starts a new message
    and a ``</s>`` token ends it; at the end of a message every distinct
    accepted word of that message has its frequency increased by one.
    A token is accepted only if it appears verbatim in the lower-cased
    ``nltk.corpus.words`` vocabulary.

    Parameters
    ----------
    file_prefix : str
        Prefix used to build the input file path.

    Returns
    -------
    (collections.Counter, int)
        The document-frequency counter and the number of accepted tokens,
        counted with repetition.
        NOTE(review): the second value counts tokens, not messages —
        confirm this is the intended normaliser for the IDF computation.
    """
    vocabulary = {entry.lower() for entry in nltk.corpus.words.words()}
    words_path = '../../' + file_prefix + '-all-words.txt'

    document_frequency = Counter()
    message_words = []
    accepted_tokens = 0

    with open(words_path, mode='r') as lines:
        for line in lines:
            token = line.strip()
            if is_start_of_log_msg(token):
                # A new message begins: forget the previous message's words.
                message_words = []
                continue
            if is_end_of_log_msg(token):
                # Each distinct word counts once per message.
                document_frequency.update(set(message_words))
                continue
            if token in vocabulary:
                message_words.append(token)
                accepted_tokens += 1

    return document_frequency, accepted_tokens

def get_idf_counter(file_prefix):
    """Build a counter of inverse document frequencies for single words.

    For every word reported by ``get_df_counter(file_prefix)`` the stored
    value is ``log10(total_count / df)``, where ``df`` is the word's
    document frequency and ``total_count`` is the second value returned by
    ``get_df_counter``.

    Parameters
    ----------
    file_prefix : str
        Passed through unchanged to ``get_df_counter``.

    Returns
    -------
    collections.Counter
        Maps each word to its (float) IDF value.
    """
    frequencies, total = get_df_counter(file_prefix)
    return Counter({
        term: math.log10(total / seen_in)
        for term, seen_in in frequencies.items()
    })


In [None]:
def get_df_bigram_counter(file_prefix):
    """Compute per-bigram document frequencies over a file of log messages.

    Reads ``'../../' + file_prefix + '-all-words.txt'`` one token per line
    and slides a window of two consecutive tokens over it. A bigram is
    recorded as ``"first second"`` when both tokens appear verbatim in the
    lower-cased ``nltk.corpus.words`` vocabulary. When the window's second
    token is ``</s>``, every distinct bigram collected for the message has
    its frequency increased by one; when the window's first token is
    ``<s>``, the collected bigrams are reset for the new message.

    Parameters
    ----------
    file_prefix : str
        Prefix used to build the input file path.

    Returns
    -------
    (collections.Counter, int)
        The bigram document-frequency counter and the number of accepted
        bigrams, counted with repetition. An empty file yields
        ``(Counter(), 0)``.
        NOTE(review): the second value counts bigram occurrences, not
        messages — confirm this is the intended normaliser for IDF.
    """
    df_counter = Counter()
    log_msg_bigrams = []
    total_count = 0
    eng_words = {entry.lower() for entry in nltk.corpus.words.words()}
    with open('../../' + file_prefix + '-all-words.txt', mode='r') as file:
        # next() with a default replaces file.__next__(): an empty file now
        # produces an empty result instead of leaking StopIteration.
        first_word = next(file, '').strip()
        for line in file:
            second_word = line.strip()
            if is_start_of_log_msg(first_word):
                # Window sits on the opening marker: start a fresh message.
                log_msg_bigrams = []
            elif is_end_of_log_msg(second_word):
                # Message finished: each distinct bigram counts once.
                for bigram in set(log_msg_bigrams):
                    df_counter[bigram] += 1
            elif first_word in eng_words and second_word in eng_words:
                log_msg_bigrams.append(first_word + " " + second_word)
                total_count += 1
            # Slide the window forward by one token.
            first_word = second_word
    return df_counter, total_count

def get_idf_bigram_counter(file_prefix):
    """Build a counter of inverse document frequencies for word bigrams.

    For every bigram reported by ``get_df_bigram_counter(file_prefix)`` the
    stored value is ``log10(total_count / df)``, where ``df`` is the
    bigram's document frequency and ``total_count`` is the second value
    returned by ``get_df_bigram_counter``.

    Parameters
    ----------
    file_prefix : str
        Passed through unchanged to ``get_df_bigram_counter``.

    Returns
    -------
    collections.Counter
        Maps each ``"first second"`` bigram to its (float) IDF value.
    """
    bigram_frequencies, total = get_df_bigram_counter(file_prefix)
    return Counter({
        bigram: math.log10(total / seen_in)
        for bigram, seen_in in bigram_frequencies.items()
    })

In [None]:
def plot_counter(cc):
    """Plot the values of ``cc`` in ``most_common()`` order.

    Opens a new 10x6 figure with the grid enabled and draws a single line
    whose y-values are the counter's values, from most to least common;
    the x-axis is the rank position.

    Parameters
    ----------
    cc : collections.Counter
        Counter whose values are plotted.
    """
    plt.figure(figsize=(10, 6))
    plt.grid(True)
    # Keep only the value of each (item, value) pair.
    ranked_values = [value for _item, value in cc.most_common()]
    plt.plot(ranked_values)
    
def get_count(x, y):
    """Return the second argument, ignoring the first.

    Handy for unpacking ``(item, count)`` pairs when only the count is
    wanted.
    """
    _item, count = x, y
    return count