## Đánh giá mức độ đồng thuận của dữ liệu
* Input: label và raw_text
* Output: chỉ số đồng thuận
### Thực hiện
* Xây dựng bộ vocab từ class positive:
 * Sử dụng ngram: 2->4
* Kí hiệu:
 * $C_{w_j}^p$: số lượng positive samples chứa pattern $w_j$
 * $C_{w_j}$: số lượng samples chứa pattern $w_j$
 * $R_i$ là 1 positive sample
 * $w_k^j$ là pattern thứ $k$ trong sample $j$
* Score cho từng pattern $w_j$ trong vocab: $$ S_{w_j} = \frac{C_{w_j}^p}{C_{w_j}}$$
* Score sample $i$ : $$S_i = \sum_{j = 0}^{N} c_j S_{w_j} $$
* Độ đồng thuận: $$ S = \frac{1}{N_r} \sum_{i} S_i$$

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import random
from base64 import b64decode
import json
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
from urllib.parse import unquote
import time

In [4]:
from pymongo import MongoClient
class DataBase:
    """Small convenience wrapper around a single MongoDB collection.

    ``config`` must provide the keys ``'db'`` and ``'collection'``; the
    selected collection is exposed as ``self.collection``.
    """

    def __init__(self,connect_string,config):
        """Connect with ``connect_string`` and select the configured collection.

        Raises:
            Exception: whatever ``MongoClient`` raised, after logging it.
        """
        import logging

        self.config = config
        try:
            self.client = MongoClient(connect_string)
        except Exception:
            # The original referenced an undefined ``logger`` here, which
            # replaced the real connection error with a NameError.
            logging.getLogger(__name__).exception("could not create MongoClient")
            raise
        self.iml = self.client[config['db']]
        self.collection = self.iml[config['collection']]

    def sample(self,k,from_time = None,to_time = None):
        """Yield ``k`` random documents (``$sample`` aggregation stage).

        ``from_time`` and ``to_time`` are accepted for compatibility but are
        not used by the query.
        """
        yield from self.collection.aggregate([{ "$sample": {"size": k}}])

    def __len__(self):
        """Return the number of documents in the collection."""
        # Collection.count() is deprecated; count_documents is its replacement.
        return self.collection.count_documents({})

    def __iter__(self):
        """Yield every document of the collection."""
        yield from self.collection.find({})

In [5]:
import re
# Matches one decimal digit. Written as a raw string so that "\d" is not
# parsed as an (invalid) string escape sequence.
p = re.compile(r'\d')
def normalize(s):
    """Return ``s`` with every digit replaced by the character '0'."""
    return p.sub('0',s)
def get_score(v):
    """Collapse the pattern scores of one sample into a single value.

    Returns 0 for an empty sequence and ``0.7 * max`` for a single score.
    Otherwise the result is the average of the mean and the maximum, scaled
    by 0.8 for two scores, 0.9 for three, and unscaled for four or more.
    """
    size = len(v)
    if size == 0:
        return 0
    if size == 1:
        return 0.7*np.max(v)
    blended = np.mean(v)+np.max(v)
    # Fewer matching patterns -> smaller weight.
    weight = {2: 0.8, 3: 0.9}.get(size)
    if weight is None:
        return blended/2
    return weight*blended/2

def evaluate(data,labels,batch = 10000,alpha = 0.7):
    """Print and return the mean pattern-based score of the 'ATTACK' samples.

    Args:
        data: sequence of raw text samples.
        labels: labels aligned with ``data`` by index; only the values
            'ATTACK' and 'NORMAL' are looked at, other labels are ignored.
        batch: number of NORMAL samples vectorized per ``transform`` call.
        alpha: a pattern is kept only when its score is strictly above this.

    Returns:
        0 when there is no ATTACK sample, otherwise ``np.mean`` of the
        per-sample ``get_score`` values (the same value is printed).
    """
    attack = [d for i,d in enumerate(data) if labels[i]=='ATTACK']
    if len(attack)==0:
        return 0
    normal = [d for i,d in enumerate(data) if labels[i]=='NORMAL']
    # Vocabulary is learned from the ATTACK samples only: char_wb 2-4 grams,
    # binary presence, with a minimum document-frequency threshold.
    count_vectorizer = CountVectorizer(analyzer="char_wb",binary=True, ngram_range=(2,4),min_df = max(2.0/len(attack),0.05),max_df=1.0)
    count_vectorizer.fit(attack)
    
    X_attack = count_vectorizer.transform(attack)
    # binary=True, so the column sum is the number of ATTACK samples containing
    # the pattern; dividing by len(attack) turns it into a fraction.
    count_attack = np.sum(X_attack,axis = 0)/len(attack)
    count_normal = np.array([0]*len(count_vectorizer.vocabulary_))
    # NORMAL samples are vectorized in slices of `batch` and summed up.
    for i in range(0,len(normal),batch):
        X_normal = count_vectorizer.transform(normal[i:i+batch])
        count_normal = count_normal + np.sum(X_normal,axis = 0)
#     print(count_attack)
#     print(count_normal)
    # NOTE(review): divides by len(normal); an input without NORMAL samples
    # is not guarded against here.
    count_normal = count_normal/len(normal)
    # Pattern score: attack fraction relative to attack + normal fraction.
    score = count_attack/(count_attack+count_normal)
#     print(score.shape)
    # assumes the sums above are 2-D with a single row, so [0] is that row
    # as a plain list - TODO confirm against the installed scipy version
    count_normal = count_normal.tolist()[0]
    score = score.tolist()[0]
#     print(score)
    # Keep column index -> score for patterns scoring above alpha that occur
    # in fewer than half of the NORMAL samples.
    score = {v:score[v] for k,v in count_vectorizer.vocabulary_.items() if score[v] > alpha and count_normal[v]<0.5}
#     X_attack = X_attack.toarray().tolist()
    # One get_score value per ATTACK row, computed from the kept patterns
    # that are present in that row.
    X_attack = [get_score([score[i] for i,v in enumerate(r.toarray().tolist()[0]) if i in score and v>0 ]) for r in X_attack]
#     score = [get_score(v) for v in X_attack]
    print(np.mean(X_attack))
    return np.mean(X_attack)

def evaluate_model(attack, normal, batch=50000, alpha=0.7,min_sample = 0.1,normal_count = True):
    """Score how consistently the ATTACK samples share attack-specific patterns.

    Patterns are char_wb 2-4 grams that occur in enough ATTACK samples. Each
    pattern gets the score ``count_attack / (count_attack + count_normal)``;
    patterns scoring above ``alpha`` are kept and every ATTACK sample is
    reduced to a single number with ``get_score``.

    Args:
        attack: list of ATTACK texts.
        normal: list of NORMAL texts (may be empty).
        batch: number of samples vectorized per ``transform`` call.
        alpha: exclusive lower bound on the score of a kept pattern.
        min_sample: lower bound on the proportion of ATTACK samples a pattern
            must occur in (the effective bound is
            ``max(2 / len(attack), min_sample)``).
        normal_count: when true, per-class counts are divided by the class
            size before the pattern score is computed.

    Returns:
        ``(precision, recall, f1)``. ``precision`` is the mean sample score
        over all ATTACK samples. Scoring of NORMAL samples is disabled, so
        ``recall`` is 1.0 whenever at least one ATTACK sample scores above
        zero and 0.0 otherwise. ``(0.0, 0.0, 0.0)`` is returned when there
        is no ATTACK sample or no pattern survives the vocabulary filters.
    """
    if len(attack) == 0:
        # Same shape as the normal return so callers can always unpack.
        return 0.0, 0.0, 0.0

    print(len(normal),len(attack))
    vectorizer_args = dict(analyzer="char_wb", binary=True, ngram_range=(2, 4))
    # A float min_df is a proportion; 2.0/len(attack) exceeds 1 for a single
    # sample, hence the clamp.
    attack_min_df = min(1.0, max(2.0 / len(attack), min_sample))

    if 0 < len(normal) < len(attack):
        # Patterns present in at least half of the NORMAL samples are treated
        # as stop words. max_df must be the float 1.0 (a proportion): the
        # integer 1 means "at most one document" and conflicts with min_df.
        stop_vectorizer = CountVectorizer(min_df=0.5, max_df=1.0, **vectorizer_args)
        stop_vectorizer.fit(normal)
        stop_word = set(stop_vectorizer.vocabulary_)
    else:
        print('build attack')
        stop_word = set()

    attack_vectorizer = CountVectorizer(min_df=attack_min_df, max_df=1.0, **vectorizer_args)
    attack_vectorizer.fit(attack)
    # Sorted so that column indices are reproducible between runs.
    vocab = sorted(set(attack_vectorizer.vocabulary_) - stop_word)
    print(len(vocab))
    if not vocab:
        return 0.0, 0.0, 0.0

    # Fixed vocabulary: fit() only validates it and sets vocabulary_.
    count_vectorizer = CountVectorizer(vocabulary=vocab, **vectorizer_args).fit([])
    n_patterns = len(count_vectorizer.vocabulary_)
    print(n_patterns)

    def column_counts(samples):
        """Return, per pattern, how many of ``samples`` contain it."""
        counts = np.zeros(n_patterns, dtype=float)
        for start in range(0, len(samples), batch):
            X = count_vectorizer.transform(samples[start:start + batch])
            counts += np.asarray(X.sum(axis=0)).ravel()
        return counts

    count_attack = column_counts(attack)
    count_normal = column_counts(normal)
    if normal_count:
        count_attack = count_attack / len(attack)
        if len(normal) > 0:
            count_normal = count_normal / len(normal)

    total = count_attack + count_normal
    ratio = np.divide(count_attack, total, out=np.zeros_like(total), where=total > 0).tolist()
    score = {index: ratio[index] for index in count_vectorizer.vocabulary_.values() if ratio[index] > alpha}

    # Score every ATTACK sample. The original only scored the matrix left
    # over from the last counting batch, i.e. at most `batch` samples.
    attack_scores = []
    for start in range(0, len(attack), batch):
        X_attack = count_vectorizer.transform(attack[start:start + batch])
        for row in X_attack:
            present = row.toarray().tolist()[0]
            attack_scores.append(get_score([score[i] for i, v in enumerate(present) if i in score and v > 0]))

    # Scoring of NORMAL samples is not implemented; kept as an empty list so
    # the recall formula stays in place.
    score_normal = []
    precision = np.mean(attack_scores)
    attack_total = np.sum(attack_scores)
    denominator = attack_total + np.sum(score_normal)
    recall = attack_total / denominator if denominator > 0 else 0.0
    f1 = 2*precision*recall/(precision+recall + 1e-10)
    print('precision',precision)
    print('recall',recall)
    print('f1',f1)
    return precision,recall,f1


In [6]:
# Load at most 300000 records from a JSON-lines file into `data_pred`.
# Each stored item is the base64-decoded, UTF-8-decoded (errors ignored) and
# URL-unquoted 'raw' field of one record.
# NOTE(review): the three label fields are set on `line`, but only the decoded
# text is appended, so they are not kept. Cells further down index data_pred
# items by key (e.g. x['label_tag']) - confirm which variant is intended.
data_pred = []
with open('./db_notag_predict_noparse.txt',encoding='utf-8') as f:
    for line in f:
        line = json.loads(line)
        # A record counts as ATTACK when the respective list has >= 2 entries.
        line['label_tag'] = 'ATTACK' if len(line['tags'])>=2 else 'NORMAL'
        line['label_extras'] = 'ATTACK' if len(line['extras'])>=2 else 'NORMAL'
        line['label'] = 'ATTACK' if len(line['extras'])>=2 or len(line['tags'])>=2 else 'NORMAL'
#         line.pop('tags')
        data_pred.append(unquote(b64decode(line['raw']).decode('utf-8','ignore')))
        if len(data_pred)==300000:
            break

In [7]:
class Ngrams:
    """Character n-gram tokenizer working on whitespace-normalised text."""

    # Two or more consecutive whitespace characters.
    _white_spaces = re.compile(r"\s\s+")

    def __init__(self,ngram_range = (1,1)):
        self.ngram_range = ngram_range

    def char_ngrams(self, text_document):
        """Return the list of character n-grams of ``text_document``.

        Runs of whitespace are collapsed to one space first. N-grams are
        listed by increasing length and, within a length, by position.
        """
        text = self._white_spaces.sub(" ", text_document)
        length = len(text)
        low, high = self.ngram_range

        grams = []
        if low == 1:
            # Unigrams are simply the characters of the text.
            grams.extend(text)
            low = 2

        for size in range(low, min(high, length) + 1):
            grams.extend(text[start:start + size] for start in range(length - size + 1))
        return grams

In [8]:
class CountNgrams:
    """Collects the distinct character n-grams of a text as dictionary keys."""

    # Two or more consecutive whitespace characters.
    _white_spaces = re.compile(r"\s\s+")

    def __init__(self,ngram_range = (1,1)):
        self.ngram_range = ngram_range

    def char_ngrams(self, text_document):
        """Return ``{ngram: 1}`` for every distinct n-gram of the text.

        Runs of whitespace are collapsed to one space first. The longest
        n-grams (``min(max_n, len(text))`` characters) are read from the text;
        shorter ones are derived by dropping the first or the last character
        of the n-grams one size up, down to ``min_n`` characters.

        Compared with the previous implementation, unigrams are included when
        ``min_n`` is 1, and a text shorter than ``min_n`` (or empty) yields an
        empty dict instead of a too-short n-gram.
        """
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        text_len = len(text_document)
        min_n, max_n = self.ngram_range
        longest = min(max_n, text_len)
        if longest == 0 or longest < min_n:
            return {}

        vocab = {text_document[i: i + longest]: 1 for i in range(text_len - longest + 1)}

        previous = vocab
        for _ in range(longest - min_n):
            current = {}
            for gram in previous:
                current[gram[:-1]] = 1
                current[gram[1:]] = 1
            # Merged only after the loop, so `previous` is never mutated
            # while it is being iterated.
            vocab.update(current)
            previous = current
        return vocab

In [7]:
class Vectorizer:
    """Builds a character n-gram vocabulary with back-off to shorter n-grams.

    ``min_df`` / ``max_df`` are proportions that ``build_vocab`` converts into
    document-count thresholds.
    """

    # Two or more consecutive whitespace characters.
    _white_spaces = re.compile(r"\s\s+")
    # e.g. "IP: 27.68.241.28". Raw string with the dots escaped; the previous
    # pattern used a bare "." and therefore accepted any separator character.
    _ip_pattern = re.compile(r'.*IP:\s*(?P<ip>\d+(?:\.\d+){3,5}).*')

    def __init__(self,ngram_range = (1,1),min_df = 0.0,max_df = 1.0):
        self.min_gram,self.max_gram = ngram_range
        self.min_df = min_df
        self.max_df = max_df
        self.vocab = {}

    def count_ngram(self,document,ngram):
        """Count the n-grams of the lower-cased, whitespace-normalised text.

        Returns:
            ``(word_counter, w_end)``: ``word_counter`` maps every n-gram
            except the one at the very end of the text to its count; that
            last n-gram (``document[-ngram:]``) is returned as ``w_end``.
        """
        document = self._white_spaces.sub(' ',document).lower()
        word_counter = {}
        for i in range(len(document)-ngram):
            w = document[i:i+ngram]
            word_counter[w] = word_counter.get(w, 0) + 1
        w_end = document[-ngram:]
        return word_counter,w_end

    def get_ip(self,raw):
        """Return the dotted address following ``IP:`` in ``raw``, or ''."""
        found = self._ip_pattern.match(raw)
        return found.group('ip') if found else ''

    def group_by_ip(self,documents):
        """Return ``{ip: {token: number of that ip's documents with token}}``.

        Only n-grams of length ``max_gram`` that are in ``self.vocab`` count.
        """
        hosts = {}
        for document in documents:
            ip = self.get_ip(document)
            word_counter, _ = self.count_ngram(document,ngram=self.max_gram)
            per_host = hosts.setdefault(ip, {})
            for token in word_counter:
                if token in self.vocab:
                    per_host[token] = per_host.get(token, 0) + 1
        return hosts

    def get_stat_vocab(self,documents):
        """Return ``{token: [number of ips, number of documents]}``."""
        hosts = self.group_by_ip(documents)
        stat_vocab = {}
        for tokens in hosts.values():
            for token,count in tokens.items():
                stat = stat_vocab.setdefault(token, [0,0])
                stat[0] += 1
                stat[1] += count
        return stat_vocab

    def build_vocab(self,documents):
        """Build ``self.vocab`` and ``self.stat_vocab`` from ``documents``.

        Starts from document frequencies of the ``max_gram``-grams. A token
        whose frequency lies in ``[min_count, max_count)`` is accepted. A
        token below ``min_count`` is shortened by its last character and its
        frequency is added to the shorter token, which is examined in the
        next round. Tokens at or above ``max_count`` are dropped.

        Returns:
            ``(self.vocab, self.stat_vocab)``.
        """
        self.min_count = int(self.min_df*len(documents))
        self.max_count = int(self.max_df*len(documents))
        vocab = {}
        word_ends = []
        for document in documents:
            word_counter,w_end = self.count_ngram(document,self.max_gram)
            word_ends.append(w_end)
            for w in word_counter:
                vocab[w] = vocab.get(w, 0) + 1

        self.vocab = {}
        for _ in range(self.max_gram-self.min_gram+1):
            n1_gram = {}
            for w,v in vocab.items():
                if self.min_count <= v < self.max_count:
                    self.vocab[w] = v
                elif v < self.min_count:
                    shorter = w[:-1]
                    n1_gram[shorter] = n1_gram.get(shorter, 0) + v
            # The trailing n-gram of each document is not in the counters, so
            # its shortened form is added here, once per document.
            n1_ends = []
            for w in word_ends:
                shorter = w[1:]
                n1_gram[shorter] = n1_gram.get(shorter, 0) + 1
                n1_ends.append(shorter)

            vocab = n1_gram
            word_ends = n1_ends
        self.stat_vocab = self.get_stat_vocab(documents)
        return self.vocab, self.stat_vocab

In [None]:
# 2- to 4-character n-grams; min_df / max_df are passed as given and are
# multiplied by the number of documents when a vocabulary is built.
vectorizer = Vectorizer(ngram_range = (2,4),min_df = 0.001,max_df = 1)

In [24]:
# Build the vocabulary from the first 100000 samples and print the elapsed
# wall-clock time in seconds.
s = time.time()
vocab,stat_vocab = vectorizer.build_vocab(data_pred[:100000])
print(time.time()-s)

172.61186861991882


In [20]:
# Group the first 20000 samples by the IP extracted from each of them and
# print the number of distinct keys.
hosts = vectorizer.group_by_ip(data_pred[0:20000])
print(len(hosts))

758


In [23]:
# Show the per-token statistics produced by the last build_vocab call.
print((vectorizer.stat_vocab))

{'1fa8': [3, 147], 'ken]': [14, 340], 'g/ h': [1, 61], '8*39': [2, 70], 'n=a6': [5, 77], ': 9.': [308, 779], 'inex': [1, 59], '3; s': [33, 186], 'h-si': [20, 58], 'slin': [1, 60], '.225': [14, 76], ':10 ': [96, 940], '8*68': [1, 50], '1 li': [57, 121], '8642': [4, 65], '1755': [6, 57], 'id.c': [56, 112], '7391': [12, 53], '34\\n': [83, 332], '.21.': [15, 58], 'bv(1': [47, 106], '7.16': [17, 133], 'nt 1': [44, 15738], '9:12': [17, 110], '0.24': [33, 128], '/42.': [11, 95], 'ip_g': [30, 51], '4:18': [17, 98], 'avix': [23, 910], '8 mo': [55, 155], '?aut': [101, 110], '9396': [4, 316], '3884': [5, 50], '-req': [908, 4620], '7216': [3, 83], '_com': [79, 9649], '/all': [2, 140], '1566': [4, 56], '; f3': [10, 55], '*123': [4, 245], 'on=d': [52, 687], '5759': [7, 52], 'indo': [162, 21321], '52:4': [80, 957], 'lenc': [167, 1077], 'de36': [2, 69], '=&_=': [30, 160], 'd1e\\': [2, 68], 'h/10': [3, 372], '-cha': [43, 113], '-379': [393, 723], 'sim?': [114, 40545], '*/ss': [1, 82], '3090': [5, 51], 

In [25]:
import time

In [None]:
# Tokenizer configured for the n-gram range (2, 5).
ngram = CountNgrams((2,5))

In [None]:
# Module-level alias for the bound tokenizer method.
char_ngrams = ngram.char_ngrams

In [None]:
# Collect every distinct n-gram of all samples as a key of `vocab`, print the
# elapsed seconds, and show the vocabulary size.
s = time.time()
vocab = {}
for d in data_pred:
    vocab.update(dict.fromkeys(ngram.char_ngrams(d), 1))
print(time.time()-s)
len(vocab)

In [None]:
# Vocabulary of the samples predicted as ATTACK: take every 5-gram, then
# derive shorter n-grams in three rounds by dropping the last / the first
# character of the n-grams found in the previous round.
ngram = Ngrams((5,5))
s = time.time()
vocab = {}
for d in pred_label['ATTACK']:
    vocab.update(dict.fromkeys(ngram.char_ngrams(d), 1))
before = vocab
curr = {}
for _ in range(3):
    for w in before:
        curr[w[1:]] = 1
        curr[w[:-1]] = 1
    vocab.update(curr)
    before, curr = curr, {}
print(time.time()-s)
len(vocab)

In [None]:
# Number of distinct n-grams currently held in `vocab`.
len(vocab)

In [6]:
def build_vocab_evaluate(attack, normal):
    """Exploratory helper: vectorize the ATTACK samples grouped by IP.

    Fits a binary char_wb 2-4 gram vocabulary on ``attack`` (patterns must
    occur in at least 2 samples), builds one summed pattern vector per IP
    group, and prints the first of them after filling it with ones.

    Returns:
        0 when ``attack`` is empty; otherwise nothing is returned.
    """
    if len(attack) == 0:
        return 0
    
    attack_ip = group_by_ip(attack)
    # NOTE(review): normal_ip is computed but not used below.
    normal_ip = group_by_ip(normal)
    
#     attack_data = ['   '.join(v) for v in attack_ip.values()]
#     normal_data = ['   '.join(v) for v in normal_ip.values()]
    count_vectorizer = CountVectorizer(analyzer="char_wb", binary=True, ngram_range=(2, 4),
                                       min_df=2, max_df=1.0)
    
    count_vectorizer.fit(attack)
    # One entry per IP group: column-wise sum over the samples of that group.
    v_attack = [count_vectorizer.transform(v).sum(axis=0) for v in attack_ip.values()]
    # NOTE(review): overwrites the first group's vector with ones before
    # printing it - looks like a debugging leftover, confirm.
    v_attack[0].fill(1)
    print(v_attack[0])

# NOTE(review): `pred_label` is created in a cell further down; the recorded
# output of this cell is a NameError.
build_vocab_evaluate(pred_label['ATTACK'],pred_label['NORMAL'])

NameError: name 'pred_label' is not defined

In [None]:
def get_ip(raw):
    """Return the dotted address that follows ``IP:`` in ``raw``, or ''.

    The pattern is applied with ``re.match``, i.e. anchored at the start of
    ``raw``. The dots are escaped so that only ``.`` is accepted as the
    separator (the previous pattern accepted any character there).
    """
    # IP: 27.68.241.28
    a = re.match(r'.*IP:\s*(?P<ip>\d+(?:\.\d+){3,5}).*',raw)
    return a.group('ip') if a else ''

def group_by_ip(raws):
    """Return ``{ip: [raw, ...]}``, keyed by ``get_ip(raw)``.

    Samples keep their input order within each group.
    """
    hosts = {}
    for r in raws:
        hosts.setdefault(get_ip(r), []).append(r)
    return hosts
def evaluate_model_v2(attack, normal, batch=50000, alpha=0.7,min_sample = 0.1,normal_count = True):
    """Variant of the pattern score that removes IP-level stop words.

    Patterns are char_wb 2-4 grams that occur in enough ATTACK samples, minus
    the patterns found in at least 2% of the per-IP documents (all samples of
    one IP joined into one text, NORMAL and ATTACK groups together). Each
    remaining pattern gets the score
    ``count_attack / (count_attack + count_normal)``; patterns scoring above
    ``alpha`` are kept and every ATTACK sample is reduced to a single number
    with ``get_score``.

    Args:
        attack: list of ATTACK texts.
        normal: list of NORMAL texts (may be empty).
        batch: number of samples vectorized per ``transform`` call.
        alpha: exclusive lower bound on the score of a kept pattern.
        min_sample: lower bound on the proportion of ATTACK samples a pattern
            must occur in (the effective bound is
            ``max(2 / len(attack), min_sample)``).
        normal_count: when true, per-class counts are divided by the class
            size before the pattern score is computed.

    Returns:
        ``(precision, recall, f1)``. ``precision`` is the mean sample score
        over all ATTACK samples. Scoring of NORMAL samples is disabled, so
        ``recall`` is 1.0 whenever at least one ATTACK sample scores above
        zero and 0.0 otherwise. ``(0.0, 0.0, 0.0)`` is returned when there
        is no ATTACK sample or no pattern survives the vocabulary filters.
    """
    if len(attack) == 0:
        # Same shape as the normal return so callers can always unpack.
        return 0.0, 0.0, 0.0

    attack_ip = group_by_ip(attack)
    normal_ip = group_by_ip(normal)

    # One document per IP: all of its samples joined with three spaces.
    attack_data = ['   '.join(v) for v in attack_ip.values()]
    normal_data = ['   '.join(v) for v in normal_ip.values()]

    print(len(attack_ip),len(normal_ip))
    print(len(normal),len(attack))

    vectorizer_args = dict(analyzer="char_wb", binary=True, ngram_range=(2, 4))
    # A float min_df is a proportion; 2.0/len(attack) exceeds 1 for a single
    # sample, hence the clamp.
    attack_min_df = min(1.0, max(2.0 / len(attack), min_sample))

    print('build attack')
    attack_vectorizer = CountVectorizer(min_df=attack_min_df, max_df=1.0, **vectorizer_args)
    attack_vectorizer.fit(attack)
    vocab = set(attack_vectorizer.vocabulary_)
    print(len(vocab))

    stop_vectorizer = CountVectorizer(min_df=0.02, max_df=1.0, **vectorizer_args)
    stop_vectorizer.fit(normal_data + attack_data)
    # Sorted so that column indices are reproducible between runs.
    vocab = sorted(vocab - set(stop_vectorizer.vocabulary_))
    print(len(vocab))
    if not vocab:
        return 0.0, 0.0, 0.0

    # Fixed vocabulary: fit() only validates it and sets vocabulary_.
    count_vectorizer = CountVectorizer(vocabulary=vocab, **vectorizer_args).fit([])
    n_patterns = len(count_vectorizer.vocabulary_)
    print(n_patterns)

    def column_counts(samples):
        """Return, per pattern, how many of ``samples`` contain it."""
        counts = np.zeros(n_patterns, dtype=float)
        for start in range(0, len(samples), batch):
            X = count_vectorizer.transform(samples[start:start + batch])
            counts += np.asarray(X.sum(axis=0)).ravel()
        return counts

    count_attack = column_counts(attack)
    count_normal = column_counts(normal)
    if normal_count:
        count_attack = count_attack / len(attack)
        if len(normal) > 0:
            count_normal = count_normal / len(normal)

    total = count_attack + count_normal
    ratio = np.divide(count_attack, total, out=np.zeros_like(total), where=total > 0).tolist()
    score = {index: ratio[index] for index in count_vectorizer.vocabulary_.values() if ratio[index] > alpha}

    # Score every ATTACK sample. The original only scored the matrix left
    # over from the last counting batch, i.e. at most `batch` samples.
    attack_scores = []
    for start in range(0, len(attack), batch):
        X_attack = count_vectorizer.transform(attack[start:start + batch])
        for row in X_attack:
            present = row.toarray().tolist()[0]
            attack_scores.append(get_score([score[i] for i, v in enumerate(present) if i in score and v > 0]))

    # Scoring of NORMAL samples is not implemented; kept as an empty list so
    # the recall formula stays in place.
    score_normal = []
    precision = np.mean(attack_scores)
    attack_total = np.sum(attack_scores)
    denominator = attack_total + np.sum(score_normal)
    recall = attack_total / denominator if denominator > 0 else 0.0
    f1 = 2*precision*recall/(precision+recall + 1e-10)
    print('precision',precision)
    print('recall',recall)
    print('f1',f1)
    return precision,recall,f1


In [None]:
# Inspect the samples grouped under one specific IP among the first 2000
# records (raises KeyError when no sample maps to that key).
group_by_ip(raws=data_pred[0:2000])['128.199.182.50']

In [None]:
import os

# SECURITY NOTE(review): the fallback URI embeds a user name and password in
# the notebook. Prefer setting WAF_MONGO_URI in the environment and removing
# the literal once nothing depends on it.
MONGO_URI = os.environ.get("WAF_MONGO_URI", "mongodb://admin:fireinthehole@127.0.0.1:27017")

# Three collections of the same database, one wrapper each.
db_rule = DataBase(MONGO_URI, {"db": "waf_dataset", "collection": "features"})
db_notag = DataBase(MONGO_URI, {"db": "waf_dataset", "collection": "features_notag"})
db_newdata = DataBase(MONGO_URI, {"db": "waf_dataset", "collection": "shop_viettel_vn_20190408"})

In [None]:
import base64
import hashlib

# Export db_newdata to a JSON-lines file, keeping one record per distinct
# non-empty payload (payloads are compared by the MD5 of the decoded bytes).
set_md5 = set()
with open('db_newdata.txt','w',encoding='utf-8') as f:
    for d in db_newdata:
        data = base64.b64decode(d["raw"])
        data_md5 = hashlib.md5(data).hexdigest()
        if len(data) == 0 or data_md5 in set_md5:
            continue
        set_md5.add(data_md5)
        # Only these four fields are written out.
        d = {key: d[key] for key in ('raw','request_id','tags','extras')}
        f.write(json.dumps(d,ensure_ascii= False) + "\n")

In [None]:
# Draw a random subset of db_rule (each record is kept with probability 0.2)
# and split the decoded payloads into attack / normal by the number of
# 'extras' entries. Stops once 100000 payloads have been collected.
data_normal = []
data_attack = []
for i in db_rule:
    if random.random()<0.8:
        continue
    raw = b64decode(i['raw']).decode('utf-8','ignore')
    (data_attack if len(i['extras'])>=2 else data_normal).append(raw)
    if len(data_attack) + len(data_normal)==100000:
        break
print(len(data_attack))

In [None]:
def select_sample(d,size=1000):
    """Split ``d`` into ``(selected, rest)`` with about ``size`` selected items.

    ``size`` may be fractional. A size of zero or less selects nothing and a
    size of ``len(d)`` or more selects everything; both cases return ``d``
    itself unshuffled and never reach ``train_test_split``, which rejects
    proportions outside (0, 1). Anything in between is a random split.
    """
    if size <= 0:
        return [],d
    if size >= len(d):
        return d,[]
    return train_test_split(d,train_size=size/len(d))

In [None]:
def test_evaluate(M1_,M2_,N_,k=5):
    """Measure how the scores react when samples are moved into the attack set.

    Two sweeps of ``k + 1`` steps are run; at step ``c`` about
    ``len(M2_) * c / k`` samples are moved:

    * first sweep: samples of ``M2_`` move into the attack set ``M1_``, the
      remainder of ``M2_`` is added to the normal set ``N_``;
    * second sweep: samples of ``N_`` move into the attack set ``M1_``, the
      remainder of ``N_`` plus all of ``M2_`` form the normal set.

    Args:
        M1_, M2_, N_: lists of texts.
        k: number of equal steps (previously accepted but ignored in favour
            of a hard-coded 5).

    Returns:
        ``(m2tom1, ntom1)``: one list of
        ``{'rate', 'precision', 'recall', 'f1'}`` dicts per sweep.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def sweep(pool, run):
        """Run one sweep, moving a growing share of ``pool``."""
        rows = []
        for c in range(k + 1):
            moved,kept = select_sample(pool,len(M2_)*c/k)
            pre,recall,f1 = run(moved,kept)
            rows.append({'rate':c/k,'precision':pre,'recall':recall,'f1':f1})
        return rows

    m2tom1 = sweep(M2_, lambda moved,kept: evaluate_model(M1_+moved,N_ + kept,alpha=0.7))
    ntom1 = sweep(N_, lambda moved,kept: evaluate_model(M1_+moved,kept+M2_,alpha=0.7))
    return m2tom1,ntom1

In [None]:
# Split the attack samples 30% / 70% and run both sweeps of test_evaluate.
# NOTE(review): `N` is assigned in a cell further down, so that cell has to
# be run first.
M3,M7 = train_test_split(data_attack,train_size=0.3)
s1,s2 = test_evaluate(M3,M7,N)

In [None]:
# Tabulate the second sweep (at most the first 20 rows).
pd.DataFrame(s2).head(20)

In [None]:
# Attack samples are split in half; normal samples are split 80% / 20% and
# then concatenated again, so N contains every normal sample.
M8,M2 = train_test_split(data_attack,train_size=0.5)
N8,N2 = train_test_split(data_normal,train_size=0.8)
N = N8 + N2

In [None]:

# Baseline: M8 is the attack set; the other half of the attack samples (M2)
# is placed in the normal set.
evaluate_model(M8,N + M2,alpha=0.7)

In [None]:
# Sizes of the normal set and of the two attack halves.
print(len(N),len(M2),len(M8))

In [None]:
# Add M2 to M8: in six steps (rate 0.0 to 1.0) a growing share of M2 is moved
# into the attack set while the remainder stays in the normal set.
score_m2_to_m8 = []
N = N8 + N2
for c in range(0,6):
    # m2a: share of M2 treated as attack; m2b: remainder treated as normal.
    m2a,m2b = select_sample(M2,len(M2)*c/5)
    pre,recall,f1 = evaluate_model(M8+m2a,N + m2b,alpha=0.7)
    score_m2_to_m8.append({'rate':c/5,'precision':pre,'recall':recall,'f1':f1})
print(pd.DataFrame(score_m2_to_m8).head(20))

In [None]:
# Plot every metric of the M2 -> M8 sweep against the rate. The entries of
# score_m2_to_m8 are dicts, so they are read by key (unpacking them as
# 2-tuples, as before, fails).
fig = plt.figure()
ax = plt.axes()

rates = [row['rate'] for row in score_m2_to_m8]
for metric in ('precision','recall','f1'):
    ax.plot(rates, [row[metric] for row in score_m2_to_m8], label=metric)
ax.legend()

In [None]:
# Add N to M8: in six steps a growing number of normal samples (relative to
# the size of M2) is moved into the attack set; the rest of N plus M2 forms
# the normal set. Each entry is (rate, result of evaluate_model).
score_n_to_m8 = []
for c in range(0,6):
    na,nb = select_sample(N,len(M2)*c/5)
    score_n_to_m8.append((c/5,evaluate_model(M8+na,nb+M2,alpha=0.7)))

In [None]:

# Tabulate the N -> M8 sweep. score_n_to_m8 holds (rate, (precision, recall,
# f1)) tuples, which is the shape this unpacking needs; `s2` holds dicts.
pd.DataFrame([{'rate':rate,'precision':pre,'recall':recall,'f1':f1} for rate,(pre,recall,f1) in score_n_to_m8]).head(20)

In [None]:
# Plot the N -> M8 sweep against the rate.
# NOTE(review): each y value is the whole tuple stored per step, so this
# presumably draws one unlabeled line per metric - confirm.
fig = plt.figure()
ax = plt.axes()

ax.plot([a for a,_ in score_n_to_m8], [a for _,a in score_n_to_m8])

In [None]:
# Draw a random subset of db_notag (each record is kept with probability 0.2,
# at most 100000 records) together with a label derived from the number of
# 'extras' entries, and print the number of ATTACK records.
data_notag = []
labels_notag = []
c_attack = 0
for i in db_notag:
    if random.random()<0.8:
        continue
    # Decoded to text like in the other loaders; raw bytes would have to be
    # decoded by the vectorizer, which is strict about invalid UTF-8.
    data_notag.append(b64decode(i['raw']).decode('utf-8','ignore'))
    is_attack = len(i['extras'])>=2
    if is_attack:
        c_attack += 1
    labels_notag.append('ATTACK' if is_attack else 'NORMAL')
    if len(data_notag)==100000:
        break
print(c_attack)

In [None]:
# NOTE(review): evaluate_model takes (attack, normal) lists, while the
# arguments here look like the (data, labels) pair that evaluate() expects;
# `data` and `labels` are also not assigned in the visible cells - confirm
# which function and which variables were intended.
vocab = evaluate_model(data,labels,alpha = 0.7)

In [None]:
# Plot `scores` against `rate_noise`.
# NOTE(review): neither name is assigned in the visible cells.
fig = plt.figure()
ax = plt.axes()

ax.plot(rate_noise, scores)

## Evaluate on db ruletag

In [None]:
# Confusion buckets keyed by (tag-based label, model prediction): print the
# size of each bucket and pickle the full mapping.
counter = {}
for x in data_pred:
    l = (x['label_tag'],x['model_predict'])
    counter.setdefault(l, []).append(x)
for k,v in counter.items():
    print(k,len(v))
# `with` closes the file handle; the bare open() call left it open.
with open('/home/dev/linhpn/confusion_matrix_notag_tag_noparse.pkl','wb') as f:
    pickle.dump(counter,f)

In [None]:
# Confusion buckets keyed by (extras-based label, model prediction): print
# the size of each bucket and pickle the full mapping.
counter = {}
for x in data_pred:
    l = (x['label_extras'],x['model_predict'])
    counter.setdefault(l, []).append(x)
for k,v in counter.items():
    print(k,len(v))
# `with` closes the file handle; the bare open() call left it open.
with open('/home/dev/linhpn/confusion_matrix_notag_extras_noparse.pkl','wb') as f:
    pickle.dump(counter,f)

In [None]:
# Print the decoded, URL-unquoted payload of up to 100 records labelled
# NORMAL but predicted ATTACK.
for x in counter[('NORMAL', 'ATTACK')][:100]:
    print(unquote(b64decode(x['raw']).decode('utf-8','ignore')))

In [None]:
# Group the decoded payloads by the combined label stored under 'label'.
extras_label = {}
for x in data_pred:
    raw = b64decode(x['raw']).decode('utf-8','ignore')
    extras_label.setdefault(x['label'], []).append(raw)

In [None]:
# Group the decoded, URL-unquoted payloads by the model prediction.
pred_label = {}
for x in data_pred:
    raw = unquote(b64decode(x['raw']).decode('utf-8','ignore'))
    pred_label.setdefault(x['model_predict'], []).append(raw)
# pre,recall,f1 = evaluate_model_v2(pred_label['ATTACK'],pred_label['NORMAL'])
# print(pre,recall,f1)

In [None]:
# Score the extras-based labelling. The result was previously bound to the
# misspelled name `recal`, so the print showed a different variable.
pre,recall,f1 = evaluate_model(extras_label['ATTACK'],extras_label['NORMAL'])
print(pre,recall,f1)

In [None]:
# `with` closes the file handle; the bare open() call left it open.
with open('counter.pkl','wb') as f:
    pickle.dump(counter,f)

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files you created.
# `with` closes the file handle; the bare open() call left it open.
with open('/home/dev/linhpn/confusion_matrix_07.pkl','rb') as f:
    old_matrix = pickle.load(f)

In [None]:
# Request ids of every record in a bucket whose prediction is ATTACK.
old_pred_att = set()
for (l,p),d in old_matrix.items():
    if p!='ATTACK':
        continue
    old_pred_att.update(c['request_id'] for c in d)

In [None]:
# Number of distinct request ids collected above.
len(old_pred_att)

In [None]:
# Label each record by membership of its request id in old_pred_att.
for x in data_pred:
    x['model_old'] = 'ATTACK' if x['request_id'] in old_pred_att else 'NORMAL'

In [None]:
# Confusion buckets keyed by (extras-based label, 'model_old' label).
counter_old = {}
for x in data_pred:
    l = (x['label_extras'],x['model_old'])
    counter_old.setdefault(l, []).append(x)

In [None]:
# Size of each confusion bucket.
for k,v in counter_old.items():
    print(k,len(v))

In [None]:
# `with` closes the file handle; the bare open() call left it open.
with open('compare_extras_model.pkl','wb') as f:
    pickle.dump(counter_old,f)

In [None]:
from urllib.parse import unquote
# Decode one percent-encoded sample string to inspect it.
unquote('%22%6F%6E%6D%6F%75%73%65%6F%76%65%72%3D%27%63%73%53%32%28%39%31%32%36%29%27%62%61%64%3D%22')

In [7]:
# For each confusion bucket, print its key and the request ids of five
# records drawn with random.choices (i.e. with replacement).
for k,v in counter_old.items():
    print(k)
    for x in random.choices(v,k=5):
        print(x['request_id'])
#         print(unquote(b64decode(x['raw']).decode('utf-8','ignore')))

NameError: name 'counter_old' is not defined

In [None]:
# Group the decoded payloads by the tag-based label and score that labelling.
tag_label = {'ATTACK':[],'NORMAL':[]}
for x in data_pred:
    tag_label[x['label_tag']].append(b64decode(x['raw']).decode('utf-8','ignore'))
score_old = evaluate_model(tag_label['ATTACK'],tag_label['NORMAL'])

In [None]:
# Group the decoded payloads by the 'model_old' label.
model_label = {'ATTACK':[],'NORMAL':[]}
for x in data_pred:
    model_label[x['model_old']].append(b64decode(x['raw']).decode('utf-8','ignore'))

In [None]:
# Score the labelling given by 'model_old'.
score_old = evaluate_model(model_label['ATTACK'],model_label['NORMAL'])

In [None]:
# `with` closes the file handle; the bare open() call left it open.
with open('counter_old.pkl','wb') as f:
    pickle.dump(counter_old,f)

In [None]:
# TODO: unfinished cell. The condition was left incomplete (empty key, no
# colon, no body), which is a SyntaxError, so it is kept here as a comment
# until the intended filter is known.
# for x in data_pred:
#     if x['label'] and x['']

In [None]:
# Among the first 10000 records, print and count the payloads predicted as
# ATTACK that contain '../'.
c = 0
for x in data_pred[:10000]:
    if x['model_predict']!='ATTACK':
        continue
    text = b64decode(x['raw']).decode('utf-8','ignore')
    if '../' in text:
        print(text)
        c+=1
print(c)
        

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files you created.
# `with` closes the file handle; the bare open() call left it open.
with open('/home/dev/linhpn/tfidf.pkl','rb') as f:
    c = pickle.load(f)
c.vocabulary_.keys()

## Evaluate db notag

In [None]:
# Load every record of a JSON-lines file into `notag_pred`, adding three
# derived labels to each record.
notag_pred = []
with open('/home/dev/linhpn/db_notag_predict.txt',encoding='utf-8') as f:
    for line in f:
        line = json.loads(line)
        # A record counts as ATTACK when the respective list has >= 2 entries;
        # 'label' is ATTACK when either of the two lists qualifies.
        line['label_tag'] = 'ATTACK' if len(line['tags'])>=2 else 'NORMAL'
        line['label_extras'] = 'ATTACK' if len(line['extras'])>=2 else 'NORMAL'
        line['label'] = 'ATTACK' if len(line['extras'])>=2 or len(line['tags'])>=2 else 'NORMAL'
#         line.pop('tags')
        notag_pred.append(line)

In [None]:
# Confusion buckets keyed by (extras-based label, model prediction), followed
# by the size of each bucket.
counter_notag = {}
for x in notag_pred:
    l = (x['label_extras'],x['model_predict'])
    counter_notag.setdefault(l, []).append(x)
for k,v in counter_notag.items():
    print(k,len(v))