# Настройка среды локально

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Run from the project root so the relative paths used below
# ('params.yaml', 'data/models/...', the `src` package) resolve.
# Set the SEC_BERT_ROOT environment variable to point at a checkout located
# elsewhere; without it the original hard-coded location is used.
os.chdir(os.environ.get('SEC_BERT_ROOT', 'C://work/dev/python/progs/texts/sec_bert/'))

In [3]:
from ruamel.yaml import YAML



# Read the project parameters; the context manager closes the file handle
# as soon as parsing is done (a bare open() left it open).
with open('params.yaml') as params_file:
    conf = YAML().load(params_file)

# Настройка среды в облаке

# Препроцессинг

In [4]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
SEED = 0
np.random.seed(SEED)
import joblib
from sklearn.metrics import average_precision_score


In [5]:
import torch
from torch.utils.data import DataLoader

# Seed the default torch generator and all CUDA generators for repeatable runs.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# Compute device: the GPU when CUDA is available, the CPU otherwise.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig, AutoModel
from transformers import DataCollatorWithPadding
from transformers import RobertaTokenizer, RobertaModel


  from .autonotebook import tqdm as notebook_tqdm


## функции

In [7]:
from src.train_eval_model import metric_multi

In [9]:
from itertools import chain
from collections import defaultdict

def get_preds(model, ld):
    """Run `model` over every batch of `ld` and collect targets and probabilities.

    The model is switched to eval mode and called without gradient tracking.
    Its raw outputs are passed through a sigmoid and moved to the CPU.

    Args:
        model: callable module that accepts a whole batch dict.
        ld: iterable of batch dicts; each batch must hold a 'target' tensor
            that can be converted with `.numpy()`.

    Returns:
        defaultdict(list) with two keys, 'target' and 'pred'; each is a flat
        list with one NumPy row per sample, in iteration order.
    """
    model.eval()

    target_chunks = []
    prob_chunks = []
    with torch.no_grad():
        for batch in ld:
            logits = model(batch)
            prob_chunks.append(logits.sigmoid().cpu().numpy())
            target_chunks.append(batch['target'].numpy())

    # Flatten the per-batch arrays into per-sample rows.
    res_d = defaultdict(list)
    res_d['target'] = list(chain.from_iterable(target_chunks))
    res_d['pred'] = list(chain.from_iterable(prob_chunks))
    return res_d

## переменные


In [10]:
# NOTE(review): `use_old_mitre` and `target` are not referenced by any other
# cell visible in this notebook — confirm they are still needed.
use_old_mitre = False
target = 'tactic' # alternatives: 'tech', 'subtech'

# Batch sizes for the train loader and for the validation/test loaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8

# Passed to the tokenizer as `max_length` (truncation is enabled there).
MAX_SEQ_LENGTH = 128

# Dropout probability of the classification head.
DROPOUT_RATIO = 0.3

# Which encoder output feeds the head: 'cls' = first token of last_hidden_state,
# 'pooler' = pooler_output.
MODE = 'cls' # or 'pooler'

# Optimizer learning rate.
LEARNING_RATE = 1e-5
# NOTE: the training cell reassigns EPOCH_NUM before the loop starts.
EPOCH_NUM = 10



## загрузка датасета и формирование target

In [12]:
# Load the label encoder and the prepared dataset from the paths listed in params.yaml.
# `mlb` is presumably a fitted multi-label binarizer; only its `classes_` attribute is used below.
# NOTE(review): joblib.load unpickles the file — only load artifacts produced by this project.
mlb = joblib.load(conf['prep_text']['mlb_fn'])
data = pd.read_csv(conf['feat_gen']['data_fn'])

# 'target' and 'labels' come back from the CSV as strings; eval turns each cell into a Python object.
# NOTE(review): eval executes arbitrary expressions. If the cells are plain literals,
# ast.literal_eval would be a safer parser — confirm the cell format before switching.
data['target'] = data['target'].map(lambda x: eval(x))
data['labels'] = data['labels'].map(lambda x: eval(x))
# Index labels of the rows that belong to the train / validation / test splits.
tr_idx = data.query('split=="tr"').index
val_idx = data.query('split=="val"').index
ts_idx = data.query('split=="ts"').index

# Моделирование

## загрузка модели

In [14]:
# Pretrained checkpoint used as the text encoder (loaded with the RoBERTa classes).
checkpoint = "ehsanaghaei/SecureBERT_Plus"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)

# Keyword options forwarded to every tokenizer call made by the dataset.
tokenizer_opts = dict(
    max_length=MAX_SEQ_LENGTH,
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

# Bare encoder; `model` and `bert_model` both refer to the same object here.
bert_model = RobertaModel.from_pretrained(checkpoint, output_hidden_states=True)
model = bert_model

# checkpoint = 'allenai/scibert_scivocab_uncased'
# tokenizer = BertTokenizer.from_pretrained(checkpoint, max_length=512)
# model = BertForSequenceClassification.from_pretrained('scibert_multi_label_model')
# bert_model= model.bert
# tokenizer_opts = {'return_tensors':"pt", 'truncation':True,
#                   'padding':'max_length', 'max_length':MAX_SEQ_LENGTH}

# checkpoint = "jackaduma/SecBERT"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenizer_opts = {'max_length':MAX_SEQ_LENGTH, 'return_tensors':"pt", 'padding':True, 'truncation':True, 'add_special_tokens':True}
# config = BertConfig.from_pretrained(checkpoint, output_hidden_states=True)
# bert_model = AutoModel.from_pretrained(checkpoint, config=config).base_model

Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT_Plus and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## подготовка данных

In [16]:
from torch.utils.data import Dataset

class TextDFDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Relies on the notebook-level `tokenizer` and `tokenizer_opts` objects,
    which are looked up each time an item is requested.
    """

    def __init__(self, df):
        """Keep a reference to the source frame.

        Args:
            df: DataFrame with a 'sentence' column (text to tokenize) and a
                'target' column (per-row sequence convertible to a float tensor).
        """
        self.df = df

    def __getitem__(self, idx):
        """Return the tokenized sentence and float target of the row at position `idx`.

        Args:
            idx: integer position of the row (0-based, independent of index labels).

        Returns:
            dict with every tokenizer output flattened to 1-D, plus a 'target'
            entry holding a float tensor.

        Raises:
            IndexError: if `idx` is outside the frame.
        """
        # Positional access: a label lookup via self.df.index[idx] returns
        # several rows when index labels repeat, which breaks tokenization.
        sentence = self.df['sentence'].iloc[idx]
        target = self.df['target'].iloc[idx]

        tok_d = tokenizer(sentence, **tokenizer_opts)

        # Each tokenizer output is flattened to 1-D before it is returned.
        item = {k: v.flatten() for k, v in tok_d.items()}
        item['target'] = torch.tensor(target, dtype=torch.float)
        return item

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]

# One dataset per split; the index is reset so rows are numbered from 0 within each split.
tr_ds = TextDFDataset(data.query('split=="tr"').reset_index(drop=True))
val_ds = TextDFDataset(data.query('split=="val"').reset_index(drop=True))
ts_ds = TextDFDataset(data.query('split=="ts"').reset_index(drop=True))


In [17]:
# A single padding collator is shared by all loaders.
pad_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the training loader shuffles; validation and test keep dataset order.
tr_ld = DataLoader(tr_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=pad_collator)
val_ld = DataLoader(val_ds, batch_size=VALID_BATCH_SIZE, shuffle=False, collate_fn=pad_collator)
ts_ld = DataLoader(ts_ds, batch_size=VALID_BATCH_SIZE, shuffle=False, collate_fn=pad_collator)

### проверка

In [18]:
# Pull the first validation batch; it is reused below as a smoke-test input for the model.
batch = next(iter(val_ld))


## архитектура модели

In [20]:
len(mlb.classes_)

14

In [22]:
class TextModelClass(torch.nn.Module):
    """Encoder backbone followed by a two-layer classification head.

    Uses the notebook-level `bert_model` as the encoder and reads the
    notebook-level `MODE` and `DEVICE` on every forward pass.
    """

    def __init__(self, classnum, dropout_ratio):
        """Build the head on top of `bert_model`.

        Args:
            classnum: number of output logits.
            dropout_ratio: dropout probability applied between the two linear layers.
        """
        super().__init__()
        self.bert = bert_model
        # Size the head from the encoder config when it exposes `hidden_size`;
        # 768 (the previously hard-coded width) remains the fallback.
        hidden_size = getattr(getattr(self.bert, 'config', None), 'hidden_size', 768)
        self.lin = torch.nn.Linear(hidden_size, hidden_size)
        self.drop_out = torch.nn.Dropout(dropout_ratio)
        self.lin_out = torch.nn.Linear(hidden_size, classnum)

    def forward(self, X):
        """Compute raw (pre-sigmoid) class scores for one batch.

        Args:
            X: batch dict; every entry except 'target' is moved to DEVICE and
                passed to the encoder as a keyword argument.

        Returns:
            Output of the final linear layer (no activation applied).

        Raises:
            ValueError: if the notebook-level MODE is neither 'cls' nor 'pooler'.
        """
        # Fail early with a clear message instead of feeding the raw encoder
        # output object into the linear layer.
        if MODE not in ('cls', 'pooler'):
            raise ValueError(f"Unsupported MODE {MODE!r}; expected 'cls' or 'pooler'")

        out = self.bert(**{k: v.to(DEVICE) for k, v in X.items() if k != 'target'})
        if MODE == 'cls':
            # Representation of the first token of the last hidden layer.
            out = out.last_hidden_state[:, 0, :]
        else:
            out = out.pooler_output

        out = torch.relu(self.lin(out))
        out = self.drop_out(out)
        return self.lin_out(out)

# Build the classifier with one output per class in mlb.classes_ and move it to DEVICE.
# NOTE: this rebinds `model`, which until now referred to the bare encoder
# (the encoder is still reachable as `bert_model` / `model.bert`).
model = TextModelClass(classnum=len(mlb.classes_), dropout_ratio=DROPOUT_RATIO)
model = model.to(DEVICE)



In [23]:
# Freeze every encoder parameter; only the classification head stays trainable.
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad_(False)

### проверка

# Smoke test: push the sample validation batch through the classifier and display its raw outputs.
out = model(batch)
out

## обучение

In [24]:
from tqdm.auto import tqdm

# NOTE: overrides the EPOCH_NUM = 10 assigned in the variables cell.
EPOCH_NUM = 5
loss_fn = torch.nn.BCEWithLogitsLoss()

# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
optimizer = torch.optim.RMSprop(model.parameters(), lr=LEARNING_RATE)

val_iter_num = 1  # validation runs every `val_iter_num` epochs
refresh_cache_iter = 10  # NOTE(review): not used anywhere in this cell
progress_bar = tqdm(range(EPOCH_NUM))

# epoch -> {'loss_tr', 'loss_val', 'pr_auc_batch', 'pr_auc'}
loss_d = {}

model = model.to(DEVICE)

for epoch in range(1, EPOCH_NUM+1):
    # ---- training pass ----
    # NOTE(review): model.train() also puts the frozen encoder into training
    # mode (its dropout layers stay active) — confirm this is intended.
    model.train()
    tr_batch_num = len(tr_ld)
    tr_loss_epoch = 0
    for batch_tr in tr_ld:
        optimizer.zero_grad()
        out = model(batch_tr)

        loss = loss_fn(out, batch_tr['target'].to(DEVICE))
        loss.backward()
        optimizer.step()
        tr_loss_epoch = tr_loss_epoch + loss.item()

    # Every epoch gets a full record. Validation fields stay NaN on epochs
    # where validation is skipped, instead of reusing stale values from an
    # earlier epoch (or failing on names that were never assigned).
    epoch_stats = {'loss_tr': tr_loss_epoch/tr_batch_num,
                   'loss_val': np.nan,
                   'pr_auc_batch': np.nan,
                   'pr_auc': np.nan}

    # ---- validation pass ----
    if epoch%val_iter_num==0:
        model.eval()
        val_batch_num = len(val_ld)
        val_loss_epoch = 0
        pr_auc = 0
        res_d = defaultdict(list)
        with torch.no_grad():
            for batch_val in val_ld:
                pred = model(batch_val)
                val_loss = loss_fn(pred, batch_val['target'].to(DEVICE))
                val_loss_epoch = val_loss_epoch+val_loss.item()
                sigm_preds = pred.sigmoid().cpu()

                # Metric on this batch alone; averaged over batches below.
                pr_auc = pr_auc + metric_multi(batch_val['target'].numpy(), sigm_preds.numpy(), average_precision_score)[0]

                res_d['target'].append(batch_val['target'].numpy())
                res_d['pred'].append(sigm_preds.numpy())

        # Flatten per-batch arrays into per-sample rows for the whole-set metric.
        res_d['target'] = list(chain(*res_d['target']))
        res_d['pred'] = list(chain(*res_d['pred']))

        epoch_stats['loss_val'] = val_loss_epoch/val_batch_num
        epoch_stats['pr_auc_batch'] = pr_auc/val_batch_num
        epoch_stats['pr_auc'] = metric_multi(np.array(res_d['target']), np.array(res_d['pred']), average_precision_score)[0]

    loss_d[epoch] = epoch_stats

    progress_bar.update(1)

progress_bar.close()

 20%|████████████████▌                                                                  | 1/5 [04:08<16:34, 248.54s/it]
KeyboardInterrupt



In [41]:
[it['pr_auc'] for it in loss_d.values()]

[0.3710388512426114,
 0.40038585810994426,
 0.4263454602383034,
 0.450738006405177,
 0.47223063070978405]

In [42]:
[it['loss_val'] for it in loss_d.values()]

[0.1306697061026898,
 0.12292649539904363,
 0.11735514902479105,
 0.11313511422023881,
 0.10940911666288951]

In [43]:
[it['loss_tr'] for it in loss_d.values()]

[0.1427033097231644,
 0.13402065215815584,
 0.12762271530379762,
 0.12275041699915964,
 0.11897184678581618]

In [None]:
PR-AUC на val ≈ 0.62; нужно проверить, нет ли переобучения (сравнить с метрикой на train).

- 130 эпох
- batch = 8
- seq_len = 128
- можно со скоростью поиграть еще

# Дотренировка

In [None]:
# !mkdir '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models'

In [44]:
# torch.save(model, 'data/models/jackaduma_model130.pt')
# Persist the whole model object (pickled module, not just a state_dict).
# The target directory is created first so the save does not fail on a fresh checkout.
os.makedirs('data/models', exist_ok=True)
torch.save(model, 'data/models/secbertplus7.pt')
# torch.save(model, '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/allenai_model12.pt')

In [None]:
# model= torch.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/jackaduma_model15.pt')
# NOTE(review): this restores a different checkpoint (allenai_model6.pt on a Colab Drive path)
# than the file written by the save cell (data/models/secbertplus7.pt) — confirm which
# model the evaluation below is meant to use.
# NOTE(review): torch.load unpickles a whole model object here; only load trusted files,
# and the defining class must be available when loading.
model= torch.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/allenai_model6.pt')

# Проверка предсказаний

In [77]:
# Collect targets and sigmoid probabilities for the validation and training loaders.
res_val = get_preds(model, ld=val_ld)
res_tr = get_preds(model, ld=tr_ld)

In [78]:
# Same collection for the test loader.
res_ts = get_preds(model, ld=ts_ld)

In [80]:
# For each split (val, train, test — in that order) print the first value
# returned by metric_multi and store the second one, paired with
# mlb.classes_, in a per-class DataFrame.
split_quality = {}
for split_name, split_res in (('val', res_val), ('tr', res_tr), ('ts', res_ts)):
    fin_metric, res_l = metric_multi(
        np.array(split_res['target']),
        np.array(split_res['pred']),
        average_precision_score,
    )
    print(fin_metric)
    split_quality[split_name] = pd.DataFrame({'qual':res_l, 'class':mlb.classes_})

res_val_df = split_quality['val']
res_tr_df = split_quality['tr']
res_ts_df = split_quality['ts']

0.6283845738053753
0.8246917459390393
0.6080092609464837


In [86]:
res_ts_df.query('qual<0.8')

Unnamed: 0,qual,class
0,0.688127,defense-evasion
1,0.643182,privilege-escalation
2,0.677441,execution
3,0.635038,persistence
4,0.789678,command-and-control
5,0.602379,collection
6,0.307826,lateral-movement
7,0.55149,credential-access
9,0.718708,resource-development
10,0.678109,reconnaissance


In [None]:
res_val_df.query('qual>0.4')['qual']

0     0.622477
1     0.511515
2     0.511755
3     0.453695
4     0.653399
5     0.459702
8     0.808540
12    0.427619
Name: qual, dtype: float64

In [None]:
# Count of samples per label in each split (one row per split, one column per label).
# Uses `data`, the frame loaded above; the name `df` is not defined anywhere in this notebook.
data.drop(columns='target').explode('labels').pivot_table(index = ['split'], columns = ['labels'], values='labels', aggfunc='count' )

labels,collection,command-and-control,credential-access,defense-evasion,discovery,execution,exfiltration,impact,initial-access,lateral-movement,persistence,privilege-escalation,reconnaissance,resource-development
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
tr,1253,2559,969,5254,2964,2517,370,244,793,474,1719,2135,140,368
ts,52,106,41,218,122,105,15,10,33,20,71,88,6,15
val,52,108,40,216,125,109,16,10,33,20,73,91,6,15
