# Probe Log Analysis with CountVectorizer

In [1]:
import pandas as pd
import plotly.plotly as py

In [57]:
# Load the tab-separated probe log. The file is read without a header row,
# so the column names are supplied explicitly.
log_columns = ['time', 'comp', 'handle', 'level', 'main', 'code', 'line', 'empty', 'log_entry']
df = pd.read_csv(
    "../input/probe.log.1",
    sep='\t',
    header=None,
    names=log_columns,
    index_col=False,
)

In [58]:
# Preview the first five parsed rows.
df.head(5)

Unnamed: 0,time,comp,handle,level,main,code,line,empty,log_entry
0,2019-04-29 18:54:37.284190,probe,main,INFO,Main,statistics_tracker.cc,474,,TCP_CLOSED_CONNECTIONS_PER_SECOND - 64376687 instances 8746409 samples
1,2019-04-29 18:54:37.284195,probe,main,INFO,Main,statistics_tracker.cc,474,,TCP_TIMEOUT_CONNECTIONS_PER_SECOND - 2070417 instances 8746409 samples
2,2019-04-29 18:54:37.284202,probe,main,INFO,Main,statistics_tracker.cc,474,,TCP_CONNECTIONS_AVG_DURATION - 25855995974557 instances 64376687 samples
3,2019-04-29 18:54:37.284208,probe,main,INFO,Main,statistics_tracker.cc,474,,TCP_ACTIVE_CONNECTIONS_PER_SECONDS - 32491459924 instances 8746409 samples
4,2019-04-29 18:54:37.284213,probe,main,INFO,Main,statistics_tracker.cc,474,,UDP_STARTED_FLOWS_PER_SECOND - 95949442 instances 8746409 samples


In [59]:
# Sanity check: list any rows whose log_entry column is missing.
df.loc[df['log_entry'].isna()]

Unnamed: 0,time,comp,handle,level,main,code,line,empty,log_entry


In [60]:
# Let each displayed cell show up to 100 characters before it is truncated.
pd.set_option('display.max_colwidth', 100)
# Keep only the rows whose level is something other than INFO.
df1 = df.loc[df['level'] != 'INFO']
# Display just the columns needed to read those entries.
df1.loc[:, ['level', 'code', 'log_entry']]

Unnamed: 0,level,code,log_entry


## Count Unigrams

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None, ngram_range=(1, 1), stop_words='english'):
    """Return the ``n`` most frequent terms in ``corpus`` with their counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize (for example a column of log messages).
    n : int or None, optional
        How many leading entries to return; ``None`` returns every term.
    ngram_range : tuple of (int, int), optional
        Passed to ``CountVectorizer``; the default ``(1, 1)`` counts single
        words.
    stop_words : str, list or None, optional
        Passed to ``CountVectorizer``; the default ``'english'`` selects its
        built-in English stop-word list.

    Returns
    -------
    list of (str, int)
        ``(term, total_count)`` pairs sorted by count, highest first. Terms
        with equal counts keep the order in which ``vocabulary_`` lists them.

    Notes
    -----
    Errors raised by ``CountVectorizer`` propagate unchanged.
    """
    vec = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    # fit_transform reads the corpus once, so one-shot iterables (generators)
    # are counted correctly and the text is not tokenized twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    # int() turns the numpy scalar into a plain Python integer.
    words_freq = [(word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Print the 20 most frequent terms across all log entries, one per line.
common_words = get_top_n_words(df['log_entry'], 20)
for entry in common_words:
    print(*entry)

instances 11501
samples 11493
connections 11440
rollups 11438
processor 11438
basic 5720
time 5720
exported 5720
tcp 5720
send_to_stitcher 5720
marker 5720
send 5720
sample 5718
new 5718
bytes 3826
allocations 3779
ssl 1475
64 1410
packets 1278
total 1207


## Bigrams

In [62]:
def get_top_n_bigram(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent two-word sequences in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize (for example a column of log messages).
    n : int or None, optional
        How many leading entries to return; ``None`` returns every bigram.
    stop_words : str, list or None, optional
        Passed to ``CountVectorizer``; the default ``None`` applies no
        stop-word filtering.

    Returns
    -------
    list of (str, int)
        ``(bigram, total_count)`` pairs sorted by count, highest first.
        Bigrams with equal counts keep the order in which ``vocabulary_``
        lists them.

    Notes
    -----
    Errors raised by ``CountVectorizer`` propagate unchanged.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    # fit_transform reads the corpus once, so one-shot iterables (generators)
    # are counted correctly and the text is not tokenized twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each bigram over all documents.
    sum_words = bag_of_words.sum(axis=0)
    # int() turns the numpy scalar into a plain Python integer.
    words_freq = [(word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Print the 20 most frequent bigrams across all log entries, one per line.
common_words = get_top_n_bigram(df['log_entry'], 20)
for entry in common_words:
    print(*entry)

rollups processor 11438
tcp connections 5720
send_to_stitcher send 5720
processor exported 5720
time marker 5720
basic connections 5720
send time 5720
processor new 5718
new sample 5718
total packets 1207
queue size 1136
size head 1016
statistics for 755
for handle 755
sessions this 714
collector sending 714
ssl sessions 714
heartbeat reported 714
this min 714
tds heartbeat 714


## Trigrams

In [63]:
def get_top_n_trigram(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent three-word sequences in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize (for example a column of log messages).
    n : int or None, optional
        How many leading entries to return; ``None`` returns every trigram.
    stop_words : str, list or None, optional
        Passed to ``CountVectorizer``; the default ``None`` applies no
        stop-word filtering.

    Returns
    -------
    list of (str, int)
        ``(trigram, total_count)`` pairs sorted by count, highest first.
        Trigrams with equal counts keep the order in which ``vocabulary_``
        lists them.

    Notes
    -----
    Errors raised by ``CountVectorizer`` propagate unchanged.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words=stop_words)
    # fit_transform reads the corpus once, so one-shot iterables (generators)
    # are counted correctly and the text is not tokenized twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each trigram over all documents.
    sum_words = bag_of_words.sum(axis=0)
    # int() turns the numpy scalar into a plain Python integer.
    words_freq = [(word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Print the 20 most frequent trigrams, with a tab between text and count.
common_words = get_top_n_trigram(df['log_entry'], 20)
for entry in common_words:
    # sep=" \t " matches print(word, "\t", freq): space, tab, space.
    print(*entry, sep=" \t ")

send time marker 	 5720
send_to_stitcher send time 	 5720
rollups processor exported 	 5720
rollups processor new 	 5718
processor new sample 	 5718
queue size head 	 1016
statistics for handle 	 755
ssl sessions this 	 714
sessions this min 	 714
sending tds heartbeat 	 714
collector sending tds 	 714
tds heartbeat reported 	 714
heartbeat reported ssl 	 714
reported ssl sessions 	 714
ssl collector sending 	 714
bytes 64 allocations 	 705
64 bytes 64 	 705
tail total packets 	 568
head tail total 	 568
size head tail 	 568
