In [None]:
from collections import Counter

import matplotlib.pyplot as plt
from ipynb.fs.full.term_frequency import get_eng_words_vector_from_file

In [None]:
def is_word(string):
    """Return True when ``string`` is a non-empty, purely alphabetic token.

    The empty string (what a blank line becomes after ``strip()``) is not a
    word. A per-character loop accepts it vacuously, which lets blank lines be
    counted as members of n-grams.

    Args:
        string: Token to classify.

    Returns:
        bool: True if ``string`` has at least one character and every
        character is alphabetic according to ``str.isalpha``; False otherwise.
    """
    # str.isalpha() is False for "" and True only if all characters are alphabetic.
    return string.isalpha()

def get_unigram_counter_words_from_all_words_file(filename_prefix):
    """Count the tokens of an all-words file that ``is_word`` accepts.

    Reads ``'../../' + filename_prefix + '-all-words.txt'`` line by line,
    strips each line, and tallies the stripped tokens for which ``is_word``
    returns True.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of token -> number of occurrences.
    """
    path = '../../' + filename_prefix + '-all-words.txt'
    with open(path, mode='r') as handle:
        tokens = (line.strip() for line in handle)
        # Counter consumes the generator fully before the file is closed.
        return Counter(token for token in tokens if is_word(token))

In [None]:
def get_bigram_counter_from_all_words_file(filename_prefix):
    """Count adjacent token pairs whose tokens are both in the file's vocabulary.

    The vocabulary is the set built from
    ``get_eng_words_vector_from_file(filename_prefix + '-all-words.txt')``
    (presumably the English words of that file -- confirm against the helper).
    Tokens are the stripped lines of
    ``'../../' + filename_prefix + '-all-words.txt'``.

    A file with fewer than two lines yields an empty counter; calling
    ``file.__next__()`` on it would leak a ``StopIteration`` instead.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of ``"first second"`` -> number of occurrences.
    """
    eng_words = set(get_eng_words_vector_from_file(filename_prefix + '-all-words.txt'))
    bigram_counter = Counter()
    window_size = 2
    window = ()
    with open('../../' + filename_prefix + '-all-words.txt', mode='r') as file:
        for line in file:
            # Keep only the most recent `window_size` tokens.
            window = (window + (line.strip(),))[-window_size:]
            if len(window) == window_size and all(word in eng_words for word in window):
                bigram_counter[" ".join(window)] += 1
    return bigram_counter

In [None]:
def get_bigram_counter_words_from_all_words_file(filename_prefix):
    """Count adjacent token pairs in which both tokens pass ``is_word``.

    Tokens are the stripped lines of
    ``'../../' + filename_prefix + '-all-words.txt'``.

    A file with fewer than two lines yields an empty counter; calling
    ``file.__next__()`` on it would leak a ``StopIteration`` instead.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of ``"first second"`` -> number of occurrences.
    """
    bigram_counter = Counter()
    window_size = 2
    window = ()
    with open('../../' + filename_prefix + '-all-words.txt', mode='r') as file:
        for line in file:
            # Keep only the most recent `window_size` tokens.
            window = (window + (line.strip(),))[-window_size:]
            if len(window) == window_size and all(map(is_word, window)):
                bigram_counter[" ".join(window)] += 1
    return bigram_counter

In [None]:
def get_trigram_counter_from_all_words_file(filename_prefix):
    """Count runs of three consecutive tokens that are all in the file's vocabulary.

    The vocabulary is the set built from
    ``get_eng_words_vector_from_file(filename_prefix + '-all-words.txt')``
    (presumably the English words of that file -- confirm against the helper).
    Tokens are the stripped lines of
    ``'../../' + filename_prefix + '-all-words.txt'``.

    A file with fewer than three lines yields an empty counter; priming the
    window with ``file.__next__()`` would leak a ``StopIteration`` when the
    file has fewer than two lines.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of ``"first second third"`` -> number of occurrences.
    """
    eng_words = set(get_eng_words_vector_from_file(filename_prefix + '-all-words.txt'))
    counter = Counter()
    window_size = 3
    window = ()
    with open('../../' + filename_prefix + '-all-words.txt', mode='r') as file:
        for line in file:
            # Keep only the most recent `window_size` tokens.
            window = (window + (line.strip(),))[-window_size:]
            if len(window) == window_size and all(word in eng_words for word in window):
                counter[" ".join(window)] += 1
    return counter

def get_trigram_counter_words_from_all_words_file(filename_prefix):
    """Count runs of three consecutive tokens that all pass ``is_word``.

    Tokens are the stripped lines of
    ``'../../' + filename_prefix + '-all-words.txt'``.

    A file with fewer than three lines yields an empty counter; priming the
    window with ``file.__next__()`` would leak a ``StopIteration`` when the
    file has fewer than two lines.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of ``"first second third"`` -> number of occurrences.
    """
    counter = Counter()
    window_size = 3
    window = ()
    with open('../../' + filename_prefix + '-all-words.txt', mode='r') as file:
        for line in file:
            # Keep only the most recent `window_size` tokens.
            window = (window + (line.strip(),))[-window_size:]
            if len(window) == window_size and all(map(is_word, window)):
                counter[" ".join(window)] += 1
    return counter

In [20]:
def get_fourgram_counter_words_from_all_words_file(filename_prefix):
    """Count runs of four consecutive tokens that all pass ``is_word``.

    Tokens are the stripped lines of
    ``'../../' + filename_prefix + '-all-words.txt'``.

    A file with fewer than four lines yields an empty counter; priming the
    window with ``file.__next__()`` would leak a ``StopIteration`` when the
    file has fewer than three lines.

    Args:
        filename_prefix: Prefix used to build the all-words file name.

    Returns:
        Counter: Mapping of ``"first second third fourth"`` -> number of
        occurrences.
    """
    counter = Counter()
    window_size = 4
    window = ()
    with open('../../' + filename_prefix + '-all-words.txt', mode='r') as file:
        for line in file:
            # Keep only the most recent `window_size` tokens.
            window = (window + (line.strip(),))[-window_size:]
            if len(window) == window_size and all(map(is_word, window)):
                counter[" ".join(window)] += 1
    return counter


In [None]:
def plot_counter(cc):
    """Plot the counts of the 25 most common entries of ``cc`` as a line.

    Opens a new 10x6 matplotlib figure and plots the counts in
    ``most_common`` order; the entries themselves are not drawn.

    Args:
        cc: Counter-like object exposing ``most_common``.
    """
    plt.figure(figsize=(10, 6))
    top_counts = [get_count(item, count) for item, count in cc.most_common(25)]
    plt.plot(top_counts)
    
def get_count(x, y):
    """Return ``y``, the second of the two arguments; ``x`` is ignored.

    Called with an unpacked ``(item, count)`` pair from ``most_common`` to
    pick out the count.
    """
    _item, count = x, y
    return count

In [None]:
# Show the 40 most common bigrams (both tokens accepted by is_word) from the 'frontend' all-words file.
get_bigram_counter_words_from_all_words_file('frontend').most_common(40)

In [None]:
# Show the 60 most common bigrams from the 'timepot' all-words file, keeping only pairs
# whose tokens are both in the vocabulary returned by get_eng_words_vector_from_file.
get_bigram_counter_from_all_words_file('timepot').most_common(60)

In [None]:
# Show the 10 most common bigrams (both tokens accepted by is_word) from the 'braintree' all-words file.
get_bigram_counter_words_from_all_words_file('braintree').most_common(10)

In [None]:
# Plot the counts of the 25 most common vocabulary-filtered bigrams for 'frontend'.
plot_counter(get_bigram_counter_from_all_words_file('frontend'))

In [None]:
# Plot the counts of the 25 most common vocabulary-filtered bigrams for 'timepot'.
plot_counter(get_bigram_counter_from_all_words_file('timepot'))

In [None]:
# Plot the counts of the 25 most common vocabulary-filtered bigrams for 'braintree'.
plot_counter(get_bigram_counter_from_all_words_file('braintree'))

In [8]:
# Count the tokens accepted by is_word in the 'message' all-words file.
# NOTE: the output recorded for this cell is a KeyboardInterrupt, i.e. that run was stopped before finishing.
message_unigrams = get_unigram_counter_words_from_all_words_file('message')

KeyboardInterrupt: 

In [10]:
def compute_bigram_probabilities(unigrams, bigrams):
    """Estimate P(second | first) = count("first second") / count(first).

    Walks the observed bigrams rather than every (first, second) pair of
    unigrams: pairs that never occur have probability 0 and are never stored,
    so scanning all pairs is quadratic work for the same mapping.

    Each bigram key is split at every space it contains, and a split is used
    only when both halves are keys of ``unigrams``. That keeps the result the
    same as matching ``first + " " + second`` against all unigram pairs, even
    for tokens that are empty or contain spaces.

    A first word whose unigram count is 0 is skipped rather than raising
    ``ZeroDivisionError``.

    Args:
        unigrams: Mapping of word -> count.
        bigrams: Mapping of ``"first second"`` -> count.

    Returns:
        Counter: Mapping of ``"second | first"`` -> probability, holding only
        strictly positive probabilities. Entries are inserted in ``bigrams``
        iteration order.
    """
    bigram_probabilities = Counter()
    for bigram, bigram_count in bigrams.items():
        space_positions = [i for i, char in enumerate(bigram) if char == " "]
        for split_at in space_positions:
            first_word = bigram[:split_at]
            second_word = bigram[split_at + 1:]
            if first_word not in unigrams or second_word not in unigrams:
                continue
            first_count = unigrams[first_word]
            if first_count == 0:
                # No valid estimate is possible for a word that was never counted.
                continue
            prob = bigram_count / first_count
            if prob > 0:
                bigram_probabilities[second_word + ' | ' + first_word] = prob

    return bigram_probabilities

In [30]:
# Merge the unigram and bigram counts of the three corpora, then estimate
# P(second | first) over the combined counts.
corpus_prefixes = ('timepot', 'frontend', 'braintree')
unigrams = sum(
    (get_unigram_counter_words_from_all_words_file(prefix) for prefix in corpus_prefixes),
    Counter(),
)
bigrams = sum(
    (get_bigram_counter_words_from_all_words_file(prefix) for prefix in corpus_prefixes),
    Counter(),
)
total_bigram_probability_counter = compute_bigram_probabilities(unigrams, bigrams)

# Number of conditional probabilities that are exactly 1.0.
count = sum(
    1
    for probability in total_bigram_probability_counter.values()
    if probability == 1.0
)

# Total number of stored probabilities, then how many of them equal 1.0.
print(len(total_bigram_probability_counter))
print(count)

889
264


In [35]:
# Keep the probabilities whose "second | first" key ends with one of the
# failure-related words, then list them from highest to lowest.
failure_words = ['error', 'fail', 'failure', 'failed', 'unavailable', 'false', 'shutdown', 'exception', 'abnormally']
failure_suffixes = tuple(failure_words)
words = Counter({
    key: probability
    for key, probability in total_bigram_probability_counter.items()
    if key.endswith(failure_suffixes)
})

words.most_common()
            

[('with | failed', 0.673828125),
 ('to | failed', 0.01953125),
 ('is | exception', 0.015151515151515152),
 ('occurred | error', 0.0107981220657277),
 ('will | exception', 0.005050505050505051),
 ('validation | failed', 0.0048828125),
 ('for | failed', 0.00390625),
 ('updating | error', 0.003755868544600939),
 ('may | error', 0.003755868544600939),
 ('in | failure', 0.0027643400138217),
 ('while | error', 0.0018779342723004694),
 ('has | error', 0.0018779342723004694),
 ('trying | exception', 0.0012626262626262627),
 ('reading | error', 0.0009389671361502347)]

In [None]:
# First 20 (key, probability) pairs, in most_common() order, whose probability is below 1.0.
[pair for pair in total_bigram_probability_counter.most_common() if pair[1] < 1.0][:20]

In [None]:
# Show the 40 most common trigrams (all three tokens accepted by is_word) from the 'timepot' all-words file.
get_trigram_counter_words_from_all_words_file('timepot').most_common(40)

In [None]:
# Show the 40 most common four-grams (all four tokens accepted by is_word) from the 'timepot' all-words file.
get_fourgram_counter_words_from_all_words_file('timepot').most_common(40)