In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import random
from base64 import b64decode
import json
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
from urllib.parse import unquote
import time

In [3]:
# Read the JSON-lines dump and keep the percent-decoded request text of each
# record, stopping as soon as 300000 texts have been collected.
data_pred = []
with open('./db_notag_predict_noparse.txt', encoding='utf-8') as f:
    for line in f:
        line = json.loads(line)

        # A record counts as an attack signal when it carries two or more entries.
        many_tags = len(line['tags']) >= 2
        many_extras = len(line['extras']) >= 2

        # The labels are written onto the parsed record only; data_pred below
        # receives just the decoded text.
        line['label_tag'] = 'ATTACK' if many_tags else 'NORMAL'
        line['label_extras'] = 'ATTACK' if many_extras else 'NORMAL'
        line['label'] = 'ATTACK' if (many_extras or many_tags) else 'NORMAL'

        # 'raw' is base64; bytes that are not valid UTF-8 are dropped before
        # the percent-escapes are decoded.
        payload = b64decode(line['raw'])
        data_pred.append(unquote(payload.decode('utf-8', 'ignore')))

        if len(data_pred) == 300000:
            break

In [4]:
# Tiny hand-written sample (three hosts, one short text each); this assignment
# replaces whatever data_pred held before.
data_pred = [
    {'ip': 12, 'text': 'abc'},
    {'ip': 23, 'text': 'sdf'},
    {'ip': 34, 'text': 'dfd'},
]

In [9]:
import re
def count_ngram(document):
    """Placeholder tokenizer: return a fixed token -> count mapping.

    ``document`` is accepted but ignored. Every call returns a new dict equal
    to ``{'abc': 1, 'bcd': 2, 'def': 3}``.
    """
    # NOTE: an earlier, disabled draft of this function counted sliding
    # character windows of length `ngram` over the whitespace-collapsed,
    # lower-cased document; that logic is not active here.
    tokens = ('abc', 'bcd', 'def')
    return {token: position for position, token in enumerate(tokens, start=1)}

class Vectorizer:
    """Build a token vocabulary and per-IP token statistics from documents.

    Every document is expected to be a mapping with an 'ip' key (used to group
    documents by host) and a 'text' key (the value handed to the tokenizer).

    Attributes:
        min_gram, max_gram: the two values of ``ngram_range``. They are only
            stored; no method of this class reads them.
        tokenize_func: callable that receives a document's text and returns a
            mapping of token -> count.
        vocab: token -> number of documents whose tokenizer output contains
            the token. Filled by ``build_vocab``.
        stat_vocab: token -> [number of distinct IPs having the token,
            number of documents containing the token]. Filled by
            ``build_vocab``.
    """

    # Matches runs of two or more whitespace characters. Currently not
    # referenced by any method of this class.
    _white_spaces = re.compile(r"\s\s+")

    def __init__(self, ngram_range=(1, 1), tokenize_func=count_ngram):
        """Store the n-gram bounds and the tokenizer; start with empty vocabularies.

        Args:
            ngram_range: 2-tuple unpacked into ``min_gram`` and ``max_gram``.
            tokenize_func: callable mapping a text to a token -> count mapping.
        """
        self.min_gram, self.max_gram = ngram_range
        self.tokenize_func = tokenize_func
        self.vocab = {}
        # Defined up front so the attribute exists before build_vocab() is called.
        self.stat_vocab = {}

    def group_by_ip(self, documents):
        """Aggregate token counts per IP, restricted to tokens in ``self.vocab``.

        Args:
            documents: iterable of mappings with 'ip' and 'text' keys.

        Returns:
            dict mapping ip -> {token: [n_documents, total_count]} where
            ``n_documents`` is how many of that IP's documents produced the
            token and ``total_count`` is the sum of the tokenizer counts.
            Tokens absent from ``self.vocab`` are ignored, so with an empty
            vocabulary every IP maps to an empty dict.
        """
        hosts = {}
        for document in documents:
            ip = document['ip']
            tokens_counter = self.tokenize_func(document['text'])
            per_host = hosts.setdefault(ip, {})
            for token, count in tokens_counter.items():
                if token not in self.vocab:
                    continue
                if token in per_host:
                    per_host[token][0] += 1
                    per_host[token][1] += count
                else:
                    per_host[token] = [1, count]
        return hosts

    def get_stat_vocab(self, documents):
        """Summarise, per token, how many IPs and how many documents contain it.

        Args:
            documents: iterable of mappings with 'ip' and 'text' keys.

        Returns:
            dict mapping token -> [n_ips, n_documents].
        """
        stat_vocab = {}
        for tokens in self.group_by_ip(documents).values():
            for token, count in tokens.items():
                entry = stat_vocab.setdefault(token, [0, 0])
                entry[0] += 1
                # count is [n_documents, total_count]; only the document count
                # is accumulated here.
                # NOTE(review): the per-IP total_count (count[1]) is never
                # aggregated — confirm that this is intended.
                entry[1] += count[0]
        return stat_vocab

    def build_vocab(self, documents):
        """Populate ``self.vocab`` and ``self.stat_vocab`` from ``documents``.

        Args:
            documents: iterable of mappings with 'ip' and 'text' keys.

        Returns:
            None. Results are stored on the instance.
        """
        if not isinstance(documents, (list, tuple)):
            # Two passes are made over the documents (here and in
            # group_by_ip); materialise one-shot iterables so the second pass
            # does not see an exhausted iterator.
            documents = list(documents)

        vocab = {}
        for document in documents:
            # Tokenize the text field, exactly as group_by_ip does, so both
            # passes see the same tokens.
            tokens_counter = self.tokenize_func(document['text'])
            for token in tokens_counter:
                vocab[token] = vocab.get(token, 0) + 1
        self.vocab = vocab
        self.stat_vocab = self.get_stat_vocab(documents)

In [10]:
# Show the sample documents; the slice caps the printout at the first 300 entries.
print(data_pred[:300])

[{'ip': 12, 'text': 'abc'}, {'ip': 23, 'text': 'sdf'}, {'ip': 34, 'text': 'dfd'}]


In [11]:
# ngram_range is only stored as min_gram/max_gram; tokenization uses the default count_ngram.
vectorizer = Vectorizer(ngram_range = (2,4))

In [12]:
# Fills vectorizer.vocab and vectorizer.stat_vocab from the sample documents.
vectorizer.build_vocab(documents=data_pred)

In [13]:
# Display token -> number of documents whose tokens include it.
vectorizer.vocab

{'abc': 3, 'bcd': 3, 'def': 3}

In [14]:
# Display token -> [number of IPs with the token, number of documents with the token].
vectorizer.stat_vocab

{'abc': [3, 3], 'bcd': [3, 3], 'def': [3, 3]}