# Настройка среды локально

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Switch the working directory to the project root so that the relative paths
# used below ('params.yaml', 'dvc_pipes/...', 'data/models/...') resolve.
# The root can be overridden with the SEC_BERT_ROOT environment variable; the
# default is the original local Windows checkout path.
PROJECT_ROOT = os.environ.get('SEC_BERT_ROOT', 'C://work/dev/python/progs/texts/sec_bert/')
os.chdir(PROJECT_ROOT)

In [3]:
from ruamel.yaml import YAML


def _load_yaml(path):
    """Parse the YAML file at ``path`` and return the loaded document.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing is done (the previous version left both files open).
    """
    with open(path) as yaml_file:
        return YAML().load(yaml_file)


# Project-wide parameters.
conf = _load_yaml('params.yaml')

# Parameters of the BERT-embedding training pipeline.
conf_dop = _load_yaml('dvc_pipes/train_bert_emb/params_bert.yaml')

In [6]:
# NOTE(review): `mitre_attack_df` is not defined in any visible cell of this
# notebook, so this cell will fail on a fresh kernel — confirm where the frame
# comes from or drop the cell.
mitre_attack_df.shape

(15149, 5)

# Настройка среды в облаке

# Препроцессинг

In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import average_precision_score
from src.funcs import set_seed

# Call the project's set_seed helper with the seed stored in params.yaml
# (presumably seeds the random generators used below — confirm in src.funcs).
set_seed(conf['seed'])

In [5]:
import torch
from torch.utils.data import DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig, AutoModel
from transformers import DataCollatorWithPadding
from transformers import RobertaTokenizer, RobertaModel



  from .autonotebook import tqdm as notebook_tqdm


## функции

In [23]:
from src.train_eval_model import metric_multi
from sklearn.metrics import average_precision_score

from itertools import chain
from collections import defaultdict

from src.funcs import get_preds


## переменные


In [9]:
# Hyper-parameters are read once from the pipeline config (conf_dop):
# the 'nn' section holds the generic training settings, 'nn_bert' the
# encoder-specific ones.
_nn_conf = conf_dop['nn']
_bert_conf = conf_dop['nn_bert']

# The same batch size is used for the training and the evaluation loaders.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = _nn_conf['batch_size']
MAX_SEQ_LENGTH = _nn_conf['maxlen']
LEARNING_RATE = _nn_conf['learning_rate']
EPOCH_NUM = _nn_conf['epoch_num']

DROPOUT_RATIO = _bert_conf['drop_ratio']
MODE = _bert_conf['mode']  # e.g. 'pooler'; the model's forward also accepts 'cls'

# Learning-rate scheduler settings and the L2 weight-decay coefficient.
exp_gamma, milestone_gamma, milestone_l, l2 = (
    _nn_conf[key] for key in ('exp_gamma', 'milestone_gamma', 'milestone_l', 'l2')
)

## загрузка датасета и формирование target

In [10]:
import ast

# NOTE(review): joblib.load unpickles the file — only load artifacts produced
# by this project's own pipeline.
mlb = joblib.load(conf['prep_text']['mlb_fn'])
data = pd.read_csv(conf['feat_gen']['data_fn'])

# 'target' and 'labels' are stored in the CSV as text and are parsed back into
# Python objects here. ast.literal_eval only accepts literals, so unlike eval()
# it cannot execute arbitrary code embedded in the file.
for list_col in ('target', 'labels'):
    data[list_col] = data[list_col].map(ast.literal_eval)

# Row labels of the train / validation / test parts, taken from the 'split' column.
tr_idx = data.query('split=="tr"').index
val_idx = data.query('split=="val"').index
ts_idx = data.query('split=="ts"').index

# Моделирование

## загрузка модели

In [11]:
# Key that selects which pretrained encoder the next cell loads
# (it branches on 'secbert_plus', 'secbert' and 'scibert').
bert_type = conf_dop['nn_bert']['bert_type']

In [12]:
# Pick the tokenizer, the encoder (`bert_model`) and the tokenizer call options
# (`tokenizer_opts`) for the configured `bert_type`.
if bert_type == 'secbert_plus':

    checkpoint = "ehsanaghaei/SecureBERT_Plus"
    tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
    model = RobertaModel.from_pretrained(checkpoint, output_hidden_states=True)
    tokenizer_opts = {'max_length':MAX_SEQ_LENGTH, 'return_tensors':"pt", 'padding':True, 'truncation':True, 'add_special_tokens':True}
    bert_model = model

elif bert_type == 'secbert':

    checkpoint = "jackaduma/SecBERT"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenizer_opts = {'max_length':MAX_SEQ_LENGTH, 'return_tensors':"pt", 'padding':True, 'truncation':True, 'add_special_tokens':True}
    config = BertConfig.from_pretrained(checkpoint, output_hidden_states=True)
    bert_model = AutoModel.from_pretrained(checkpoint, config=config).base_model

elif bert_type == 'scibert':

    checkpoint = 'allenai/scibert_scivocab_uncased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint, max_length=512)
    # NOTE(review): 'scibert_multi_label_model' looks like a locally saved
    # fine-tuned model rather than a hub id — confirm it exists next to the notebook.
    model = BertForSequenceClassification.from_pretrained('scibert_multi_label_model')
    # Only the encoder is kept; the classification head of the loaded model is not used.
    bert_model = model.bert
    tokenizer_opts = {'return_tensors':"pt", 'truncation':True,
                      'padding':'max_length', 'max_length':MAX_SEQ_LENGTH}

else:
    # Fail here with a clear message instead of a NameError on `tokenizer` /
    # `bert_model` in a later cell.
    raise ValueError(
        f"Unsupported bert_type {bert_type!r}; expected 'secbert_plus', 'secbert' or 'scibert'"
    )



## подготовка данных

In [15]:
from torch.utils.data import Dataset

class TextDFDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: frame with a 'sentence' column (text passed to the tokenizer) and a
            'target' column (value converted to a float tensor).
        tokenizer: callable invoked as ``tokenizer(text, **tokenizer_opts)``;
            it must return a mapping whose values support ``.flatten()``.
        tokenizer_opts: keyword arguments forwarded to every tokenizer call.
    """

    def __init__(self, df, tokenizer, tokenizer_opts):
        self.df = df
        self.tokenizer = tokenizer
        self.tokenizer_opts = tokenizer_opts

    def __getitem__(self, idx):
        """Return the flattened tokenizer outputs for row ``idx`` plus its 'target' tensor."""
        # Rows are addressed by position: a label lookup through
        # ``df.loc[df.index[idx], col]`` yields a Series instead of a single
        # value whenever index labels repeat.
        sentence = self.df['sentence'].iloc[idx]
        target = self.df['target'].iloc[idx]

        tok_d = self.tokenizer(sentence, **self.tokenizer_opts)

        # Every tokenizer output is flattened to 1-D so that a collate function
        # can batch the items afterwards.
        item = {k: v.flatten() for k, v in tok_d.items()}
        item['target'] = torch.tensor(target, dtype=torch.float)
        return item

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

# One dataset per split; every split frame is re-indexed from zero.
tr_ds, val_ds, ts_ds = (
    TextDFDataset(
        data.query(f'split=="{split_name}"').reset_index(drop=True),
        tokenizer=tokenizer,
        tokenizer_opts=tokenizer_opts,
    )
    for split_name in ('tr', 'val', 'ts')
)


In [16]:
# Loaders for the three splits: only the training loader shuffles, and each
# loader gets its own padding collator built from the shared tokenizer.
tr_ld, val_ld, ts_ld = (
    DataLoader(
        split_ds,
        batch_size=split_batch_size,
        shuffle=do_shuffle,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    for split_ds, split_batch_size, do_shuffle in (
        (tr_ds, TRAIN_BATCH_SIZE, True),
        (val_ds, VALID_BATCH_SIZE, False),
        (ts_ds, VALID_BATCH_SIZE, False),
    )
)

### проверка

In [17]:
# Pull one collated batch from the validation loader; it is reused further
# down to smoke-test the model's forward pass.
batch = next(iter(val_ld))


## архитектура модели

In [18]:
# Number of entries in mlb.classes_; the same value is passed below as the
# output size (classnum) of the classifier.
len(mlb.classes_)

14

In [19]:
class TextModelClass(torch.nn.Module):
    """Two-layer classification head on top of the notebook-level ``bert_model``.

    The class reads three notebook globals: ``bert_model`` (the encoder, at
    construction time), ``MODE`` and ``DEVICE`` (at every forward call).

    Args:
        classnum: number of output units of the final linear layer.
        dropout_ratio: dropout probability applied between the two linear layers.
    """

    def __init__(self, classnum, dropout_ratio):

        super().__init__()
        self.bert = bert_model
        # Take the encoder width from its config when it exposes one instead of
        # assuming 768; 768 stays the fallback, so behaviour is unchanged for
        # encoders of that width.
        hidden_size = getattr(getattr(self.bert, 'config', None), 'hidden_size', 768)
        self.lin = torch.nn.Linear(hidden_size, hidden_size)
        self.drop_out = torch.nn.Dropout(dropout_ratio)
        self.lin_out = torch.nn.Linear(hidden_size, classnum)

    def forward(self, X):
        """Return un-activated class scores for a batch.

        Args:
            X: mapping of encoder inputs; every value is moved to ``DEVICE`` and
                the 'target' entry, if present, is not passed to the encoder.

        Raises:
            ValueError: if the global ``MODE`` is neither 'cls' nor 'pooler'.
        """
        encoder_inputs = {k: v.to(DEVICE) for k, v in X.items() if k != 'target'}
        out = self.bert(**encoder_inputs)

        if MODE == 'cls':
            # Hidden state at the first sequence position.
            out = out.last_hidden_state[:, 0, :]
        elif MODE == 'pooler':
            out = out.pooler_output
        else:
            # Previously an unknown mode fell through and the raw encoder
            # output object was handed to the linear layer.
            raise ValueError(f"Unsupported MODE {MODE!r}; expected 'cls' or 'pooler'")

        out = self.lin(out)
        out = torch.relu(out)
        out = self.drop_out(out)
        out = self.lin_out(out)

        return out

# Build the classifier with one output per entry of mlb.classes_ and place it on DEVICE.
model = TextModelClass(
    classnum=len(mlb.classes_),
    dropout_ratio=DROPOUT_RATIO,
).to(DEVICE)



In [20]:
# Freeze every parameter of the encoder: gradients are switched off for all of
# model.bert, the two linear layers of the head are left untouched.
for bert_weight in model.bert.parameters():
    bert_weight.requires_grad = False

### проверка

# Smoke test: run the model on the sample validation batch and display the
# un-activated outputs of the final linear layer.
out = model(batch)
out

## обучение

In [21]:
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import StepLR, ExponentialLR, MultiStepLR
from sklearn.metrics import average_precision_score, log_loss

# Training targets are float vectors and the model returns un-activated
# scores, so the sigmoid is folded into the loss.
loss_fn = torch.nn.BCEWithLogitsLoss()

model = model.to(DEVICE)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=l2)

# Two schedulers share the optimizer; both are stepped once per epoch.
scheduler1 = ExponentialLR(optimizer, gamma=conf_dop['nn']['exp_gamma'])
scheduler2 = MultiStepLR(optimizer, milestones=conf_dop['nn']['milestone_l'], gamma=conf_dop['nn']['milestone_gamma'])

val_iter_num = 1  # validate every `val_iter_num` epochs
refresh_cache_iter = 10  # NOTE(review): not used anywhere in the visible notebook
progress_bar = tqdm(range(EPOCH_NUM))

# epoch number -> dict of train / validation metrics for that epoch
loss_d = {}

for epoch in range(1, EPOCH_NUM+1):
    model.train()
    res_d = defaultdict(list)
    tr_batch_num = len(tr_ld)
    tr_loss_epoch = 0
    for batch_tr in tr_ld:
        optimizer.zero_grad()
        out = model(batch_tr)

        loss = loss_fn(out, batch_tr['target'].to(DEVICE))
        loss.backward()
        optimizer.step()
        tr_loss_epoch = tr_loss_epoch + loss.item()

        # Keep the training predictions for the epoch-level metrics. The
        # tensors are detached instead of calling model.eval() here: switching
        # to eval mode inside the batch loop disabled dropout for every batch
        # after the first one of each epoch.
        res_d['tr_target'].append(batch_tr['target'].numpy())
        res_d['tr_pred'].append(out.detach().sigmoid().cpu().numpy())

    # Flatten the per-batch arrays into one list of per-sample rows.
    res_d['tr_target'] = list(chain(*res_d['tr_target']))
    res_d['tr_pred'] = list(chain(*res_d['tr_pred']))

    scheduler1.step()
    scheduler2.step()

    # Validation metrics default to NaN so that epochs without a validation
    # pass (val_iter_num > 1) neither fail nor reuse values of an earlier epoch.
    nan = float('nan')
    log_loss_val_batch = pr_auc_batch = log_loss_val = pr_auc_val = nan

    if epoch%val_iter_num==0:
        model.eval()
        val_batch_num = len(val_ld)
        val_loss_epoch = 0
        pr_auc = 0
        with torch.no_grad():
            for batch_val in val_ld:
                pred = model(batch_val)
                val_loss = loss_fn(pred, batch_val['target'].to(DEVICE))
                val_loss_epoch = val_loss_epoch+val_loss.item()
                sigm_preds = pred.sigmoid().cpu()

                # Per-batch metric, averaged over batches below.
                pr_auc = pr_auc + metric_multi(batch_val['target'].numpy(), sigm_preds.numpy(), average_precision_score)[0]

                res_d['target'].append(batch_val['target'].numpy())
                res_d['pred'].append(sigm_preds.numpy())

        res_d['target'] = list(chain(*res_d['target']))
        res_d['pred'] = list(chain(*res_d['pred']))

        log_loss_val_batch = val_loss_epoch/val_batch_num
        pr_auc_batch = pr_auc/val_batch_num
        log_loss_val = metric_multi(np.array(res_d['target']), np.array(res_d['pred']), log_loss)[0]
        pr_auc_val = metric_multi(np.array(res_d['target']), np.array(res_d['pred']), average_precision_score)[0]

    # '*_batch' entries are means of per-batch values; the others are computed
    # once over all samples collected during the epoch.
    loss_d[epoch] = {'log_loss_tr_batch':tr_loss_epoch/tr_batch_num,
                     'log_loss_val_batch':log_loss_val_batch,
                     'pr_auc_batch':pr_auc_batch,
                     'log_loss_val':log_loss_val,
                     'log_loss_tr':metric_multi(np.array(res_d['tr_target']), np.array(res_d['tr_pred']), log_loss)[0],
                     'pr_auc_val':pr_auc_val,
                     'pr_auc_tr':metric_multi(np.array(res_d['tr_target']), np.array(res_d['tr_pred']), average_precision_score)[0]}
    progress_bar.update(1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [10:59<00:00, 329.20s/it]

In [73]:
# Split the configured model path into its directory and file name, then
# save the whole model object under '<dirname>/<bert_type>_<name>'.
dirname, name = os.path.split(conf_dop['nn_bert']['model_fn'])

torch.save(model, f'{dirname}/{bert_type}_{name}')

In [74]:
# Load the model object back from '<dirname>/<bert_type>_<name>'.
# NOTE(review): torch.load unpickles a full model object here — only load
# files written by this notebook.
model = torch.load(f'{dirname}/{bert_type}_{name}')
# Keep the file that was just loaded by renaming it with a 'prev_' prefix.

os.rename(f'{dirname}/{bert_type}_{name}', f'{dirname}/prev_{bert_type}_{name}')

In [None]:
import matplotlib.pyplot as plt

# Per-epoch log-loss on the training and validation data, one curve per metric.
for _metric_key in ('log_loss_tr', 'log_loss_val'):
    plt.plot(
        loss_d.keys(),
        [epoch_metrics[_metric_key] for epoch_metrics in loss_d.values()],
        label=_metric_key,
    )
plt.legend()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch average-precision score on the training and validation data.
for _metric_key in ('pr_auc_tr', 'pr_auc_val'):
    plt.plot(
        loss_d.keys(),
        [epoch_metrics[_metric_key] for epoch_metrics in loss_d.values()],
        label=_metric_key,
    )
plt.legend()

In [None]:
# Validation average precision per epoch. The training loop stores it under
# 'pr_auc_val'; the former 'pr_auc' key no longer exists in loss_d.
[it['pr_auc_val'] for it in loss_d.values()]

In [None]:
# Mean per-batch validation loss per epoch. The training loop stores it under
# 'log_loss_val_batch'; the former 'loss_val' key no longer exists in loss_d.
[it['log_loss_val_batch'] for it in loss_d.values()]

In [None]:
# Mean per-batch training loss per epoch. The training loop stores it under
# 'log_loss_tr_batch'; the former 'loss_tr' key no longer exists in loss_d.
[it['log_loss_tr_batch'] for it in loss_d.values()]

# Дотренировка

In [None]:
# !mkdir '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models'

In [None]:
# Manual snapshot of the current model under a hard-coded file name (relative
# to the working directory). The commented-out lines are alternative targets
# kept from earlier runs.
# torch.save(model, 'data/models/jackaduma_model130.pt')
torch.save(model, 'data/models/secbertplus7.pt')
# torch.save(model, '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/allenai_model12.pt')

In [60]:
# Replace the in-memory model with the one stored at the configured model path.
# NOTE(review): torch.load unpickles a full model object — only load trusted files.
# model= torch.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/jackaduma_model15.pt')
model = torch.load(conf_dop['nn_bert']['model_fn'])

In [63]:
# Keep the model file that was just loaded by renaming it in place with a
# 'prev_' prefix.
dirname, name = os.path.split(conf_dop['nn_bert']['model_fn'])
os.rename(conf_dop['nn_bert']['model_fn'], f'{dirname}/prev_{name}')

In [71]:
# Show the first rows with a wider column limit so long sentences are readable.
# NOTE(review): `tram_df` is created in a cell that appears further down in the
# notebook (lower execution count), so this cell fails on a top-to-bottom run.
with pd.option_context('display.max_colwidth', 200):
    display(tram_df.head())

Unnamed: 0,sentence,labels
0,"title: NotPetya Technical Analysis – A Triple Threat: File Encryption, MFT Encryption, Credential Theft url: https://www.crowdstrike.com/blog/petrwrap-ransomware-technical-analysis-triple-threat-f...",[]
1,Executive Summary This technical analysis provides an in-depth analysis and review of NotPetya.,[]
2,For more information on CrowdStrike’s proactive protection features see the earlier CrowdStrike blog on how Falcon Endpoint Protection prevents the NotPetya attack.,[]
3,NotPetya combines ransomware with the ability to propagate itself across a network.,[]
4,"It spreads to Microsoft Windows machines using several propagation methods, including the EternalBlue exploit for the CVE-2017-0144 vulnerability in the SMB service.",[T1210]


In [68]:
# Load the TRAM data from the configured JSON file and drop the 'doc_title' column.
tram_df = pd.read_json(conf['get_data']['tram_fn']).drop(columns='doc_title')
# NOTE(review): the table shown under this cell appears to come from the
# commented-out filter below, not from the assignment above.
# tram_df[(tram_df.sentence.str.contains("5.")) & (tram_df.sentence.str.len()<10)]

Unnamed: 0,sentence,labels
1235,Figure 5.,[]
1584,Figure 5.,[]
1944,Figure 5.,[]
3093,T1059,[]
3110,T1562.001,[]
3112,T1564.003,[]
3362,Figure 5.,[]
4392,Figure 5.,[]
4612,Figure 5.,[]
4816,[T1059,[]


# Проверка предсказаний

In [None]:
# Run the project's get_preds helper on the validation and training loaders;
# the results are read below through their 'target' and 'pred' entries.
res_val = get_preds(model, ld=val_ld)
res_tr = get_preds(model, ld=tr_ld)

In [None]:
# Same helper on the test loader; read below through 'target' and 'pred'.
res_ts = get_preds(model, ld=ts_ld)

In [None]:
# For each split: print the aggregate average-precision score returned by
# metric_multi and keep the per-class values next to the class names.
_quality_frames = {}
for _split_name, _split_res in (('val', res_val), ('tr', res_tr), ('ts', res_ts)):
    fin_metric, res_l = metric_multi(
        np.array(_split_res['target']),
        np.array(_split_res['pred']),
        average_precision_score,
    )
    print(fin_metric)
    _quality_frames[_split_name] = pd.DataFrame({'qual':res_l, 'class':mlb.classes_})

res_val_df = _quality_frames['val']
res_tr_df = _quality_frames['tr']
res_ts_df = _quality_frames['ts']

In [None]:
# Test-split classes whose per-class score is below 0.8.
res_ts_df.query('qual<0.8')

In [None]:
# Per-class validation scores above 0.4.
res_val_df.query('qual>0.4')['qual']

In [None]:
# Count label occurrences per split: one row per split, one column per label.
# NOTE(review): `df` is not defined in any visible cell; the columns used here
# ('target', 'labels', 'split') match the `data` frame loaded earlier — confirm
# whether `data` was intended.
df.drop(columns='target').explode('labels').pivot_table(index = ['split'], columns = ['labels'], values='labels', aggfunc='count' )