# Настройка среды

In [None]:
!pip install datasets -q
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install iterative-stratification

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which

In [None]:
!mkdir scibert_multi_label_model
!wget https://ctidtram.blob.core.windows.net/tram-models/multi-label-20230803/config.json -O scibert_multi_label_model/config.json
!wget https://ctidtram.blob.core.windows.net/tram-models/multi-label-20230803/pytorch_model.bin -O scibert_multi_label_model/pytorch_model.bin


--2024-07-23 05:56:41--  https://ctidtram.blob.core.windows.net/tram-models/multi-label-20230803/config.json
Resolving ctidtram.blob.core.windows.net (ctidtram.blob.core.windows.net)... 52.239.246.4
Connecting to ctidtram.blob.core.windows.net (ctidtram.blob.core.windows.net)|52.239.246.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2806 (2.7K) [application/json]
Saving to: ‘scibert_multi_label_model/config.json’


2024-07-23 05:56:41 (896 MB/s) - ‘scibert_multi_label_model/config.json’ saved [2806/2806]

--2024-07-23 05:56:41--  https://ctidtram.blob.core.windows.net/tram-models/multi-label-20230803/pytorch_model.bin
Resolving ctidtram.blob.core.windows.net (ctidtram.blob.core.windows.net)... 52.239.246.4
Connecting to ctidtram.blob.core.windows.net (ctidtram.blob.core.windows.net)|52.239.246.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 439900469 (420M) [application/octet-stream]
Saving to: ‘scibert_multi_label_model/pytorch_

# Препроцессинг

In [None]:
import pandas as pd
import numpy as np
# Global random seed; also passed as `random_state` to the stratified splitters below.
SEED = 0
# Seeds NumPy's legacy global generator only.
np.random.seed(SEED)
import joblib
from sklearn.metrics import average_precision_score


In [None]:
import torch
from torch.utils.data import DataLoader

# Seed PyTorch too: the notebook seeds NumPy only, which leaves weight initialisation,
# dropout and DataLoader shuffling different on every run.
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig, AutoModel
from transformers import DataCollatorWithPadding


## функции

In [None]:
def metric_multi(y, y_pred, metric_fn, ignore_const_target=True, **kwargs):
  """Score every label column separately and average the results.

  Args:
      y: 2-D array of ground-truth labels, one column per class.
      y_pred: 2-D array of predicted scores with the same column layout as ``y``.
      metric_fn: callable ``metric_fn(y_true_col, y_pred_col, **kwargs)``.
      ignore_const_target: when True, a column whose targets are all zero is not
          passed to ``metric_fn``; it scores 0 if any prediction exceeds 0.5 and
          1 otherwise. When False, ``metric_fn`` is called for every column with
          an additional ``labels=[0, 1]`` keyword.
      **kwargs: forwarded to ``metric_fn``.

  Returns:
      Tuple ``(mean_score, per_column_scores)``.
  """

  def _score_column(col):
      truth = y[:, col]
      scores = y_pred[:, col]

      if not ignore_const_target:
          return metric_fn(truth, scores, labels=[0,1], **kwargs)

      # Column contains something other than only zeros -> the metric is well defined.
      if set(truth)!={0}:
          return metric_fn(truth, scores, **kwargs)

      # All-zero target: penalise any confident positive, reward none.
      return 0 if (scores>0.5).sum()>0 else 1

  per_column = [_score_column(col) for col in range(y.shape[1])]

  return np.mean(per_column), per_column

In [None]:
from itertools import chain
from collections import defaultdict

def get_preds(model, ld):
    """Run ``model`` over every batch of ``ld`` and collect targets and probabilities.

    The model is switched to eval mode and gradients are disabled. Model outputs
    are passed through a sigmoid and moved to the CPU. ``batch['target']`` is
    converted with ``.numpy()`` directly, so it has to be a CPU tensor.

    Returns:
        A ``defaultdict(list)`` with two keys, ``'target'`` and ``'pred'``; each
        holds a flat list with one NumPy row per sample, in loader order.
    """
    model.eval()

    target_batches = []
    prob_batches = []

    with torch.no_grad():
        for batch in ld:
            logits = model(batch)
            probs = logits.sigmoid().cpu()
            target_batches.append(batch['target'].numpy())
            prob_batches.append(probs.numpy())

    # Flatten the per-batch arrays into one list of per-sample rows.
    res_d = defaultdict(list)
    res_d['target'] = list(chain.from_iterable(target_batches))
    res_d['pred'] = list(chain.from_iterable(prob_batches))

    return res_d

## переменные


In [None]:
# Selects which pickled MITRE frame is loaded below:
# True -> mitre_alt.pkl, False -> mitre_source.pkl.
use_old_mitre = False
# Label granularity. 'tactic' triggers the label -> tactic remapping further down.
target = 'tactic' # alternatives: 'tech', 'subtech'

# DataLoader batch sizes for training and for validation/test.
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32

# Small batches for quick debugging runs:
# TRAIN_BATCH_SIZE = 4
# VALID_BATCH_SIZE = 4

# Dropout probability used in the classification head.
DROPOUT_RATIO = 0.3

# Which encoder output feeds the head: 'cls' (first token of the last hidden state)
# or 'pooler' (the encoder's pooler_output).
MODE = 'cls' # or 'pooler'

LEARNING_RATE = 1e-5
# NOTE(review): the training cell reassigns EPOCH_NUM before looping, so this value
# is not the one actually used there.
EPOCH_NUM = 10



## загрузка датасета и формирование target

In [None]:
# Read multi_label.json from the Drive folder and drop its 'doc_title' column.
tram_df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/scibert/multi_label.json').drop(columns='doc_title')


In [None]:
# Load the MITRE frame: the "alt" pickle when use_old_mitre is set, the "source" one otherwise.
# NOTE: joblib.load unpickles arbitrary objects - only point it at trusted files.
mitr_df = joblib.load(
    '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/mitre_alt.pkl'
    if use_old_mitre
    else '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/mitre_source.pkl'
)

In [None]:
# Stack the MITRE rows on top of the TRAM rows; ignore_index gives a fresh 0..n-1 index.
df = pd.concat([mitr_df, tram_df], ignore_index=True)

In [None]:
# rep_data  = joblib.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/rep_data.pkl')
# df = pd.concat([df, rep_data[['sentence', 'labels']]], ignore_index=True)

In [None]:
if target == 'tactic':
  from itertools import chain
  # Lookup from an original label to the list of tactics it belongs to.
  # NOTE: joblib.load unpickles arbitrary objects - only point it at trusted files.
  label2tactic = joblib.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/label2tactic.pkl')

  # Replace every row's labels by the concatenation of their tactic lists.
  # The column is overwritten in place, so this cell is not safe to run twice.
  df['labels'] = df['labels'].map(
      lambda row_labels: list(chain.from_iterable(label2tactic[label] for label in row_labels))
  )

# joblib.dump(df, '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/mitre_full_df.pkl')


___

In [None]:
# mitr_df = tram_df[tram_df['labels'].str.len()>1]

# df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/data_df.csv')
# df['labels'] = df['labels'].map(lambda x: eval(x))
# df

___

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Distinct label values over the whole frame; NaN entries produced by explode are dropped.
CLASSES = df.explode('labels')['labels'].dropna().unique()


In [None]:
# Multi-hot encoder with the class list fixed explicitly to CLASSES.
mlb = MultiLabelBinarizer(classes=CLASSES)
# Fit on one single-label sample per class.
mlb.fit([[c] for c in CLASSES])



In [None]:
# Number of output classes; used as the width of the model's final linear layer.
CLASS_NUM = len(mlb.classes_)

In [None]:
# Store each row's multi-hot encoding of its labels as a plain Python list.
df['target'] = mlb.transform(df['labels']).tolist()

## разбиение на выборки

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


val_ts_size = 0.15

# Outer split: the first fold of a stratified K-fold gives (train, held-out) positions.
# NOTE(review): the held-out share is 1/n_splits; with val_ts_size = 0.15 n_splits is
# int(1/0.3) = 3, i.e. about a third of the rows rather than exactly 30%.
mskf = MultilabelStratifiedKFold(n_splits=int(1/(2*val_ts_size)), shuffle=True, random_state=SEED)

# positions from 0 to n
tr_idx, val_ts_idx = next(mskf.split(df.values, np.array(df['target'].tolist())))

# Inner split: cut the held-out part into two halves (validation / test).
mskf = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)

# positions from 0 to m (relative to the held-out subset)
val_idx, ts_idx = next(
    mskf.split(df.iloc[val_ts_idx].values, np.array(df['target'].iloc[val_ts_idx].tolist()))
)

# Translate the subset-relative positions back to positions in the full frame.
val_idx = val_ts_idx[val_idx]
ts_idx = val_ts_idx[ts_idx]

# Everything is 'tr' unless it was selected for validation or test.
df['split'] = 'tr'
df.loc[df.index[val_idx], 'split'] = 'val'
df.loc[df.index[ts_idx], 'split'] = 'ts'

In [None]:
# Sanity check: number of occurrences of every label within each split.
df.drop(columns='target').explode('labels').pivot_table(index = ['split'], columns = ['labels'], values='labels', aggfunc='count' )

labels,collection,command-and-control,credential-access,defense-evasion,discovery,execution,exfiltration,impact,initial-access,lateral-movement,persistence,privilege-escalation,reconnaissance,resource-development
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
tr,1253,2559,969,5254,2964,2517,370,244,793,474,1719,2135,140,368
ts,52,106,41,218,122,105,15,10,33,20,71,88,6,15
val,52,108,40,216,125,109,16,10,33,20,73,91,6,15


In [None]:
# Rebind the split index variables to the index labels of each split
# (they previously held positional arrays returned by the splitter).
tr_idx = df.query('split=="tr"').index
val_idx = df.query('split=="val"').index
ts_idx = df.query('split=="ts"').index

# Моделирование

## загрузка модели

In [None]:
# Tokenizer comes from the SciBERT hub checkpoint.
checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint, max_length=512)
# Classifier weights are read from the local 'scibert_multi_label_model' folder.
model = BertForSequenceClassification.from_pretrained('scibert_multi_label_model')
# Keep only the encoder; the custom head is defined further down.
bert_model= model.bert
# Options applied to every tokenizer call in the dataset.
# NOTE(review): 'padding':'max_length' pads every sample to 512 tokens, so the
# DataCollatorWithPadding used by the loaders has nothing left to pad - confirm
# whether dynamic padding was intended.
tokenizer_opts = {'return_tensors':"pt", 'truncation':True,
                  'padding':'max_length', 'max_length':512}

# checkpoint = "jackaduma/SecBERT"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenizer_opts = {'max_length':512, 'return_tensors':"pt", 'padding':True, 'truncation':True, 'add_special_tokens':True}
# config = BertConfig.from_pretrained(checkpoint, output_hidden_states=True)
# bert_model = AutoModel.from_pretrained(checkpoint, config=config).base_model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Загружаю модель с помощью библиотеки transformers приведённым ниже кодом. Как указать файлы модели вручную?

checkpoint = "jackaduma/SecBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint, config=config).base_model

## подготовка данных

In [None]:
from torch.utils.data import Dataset

class TextDFDataset(Dataset):
    """Map-style dataset over a DataFrame with 'sentence' and 'target' columns.

    Every item is the tokenizer output for the row's sentence (each tensor
    flattened to 1-D) plus a float tensor 'target'. Relies on the module-level
    ``tokenizer`` and ``tokenizer_opts``.
    """

    def __init__(self, df):
      self.df = df

    def __len__(self):
      return self.df.shape[0]

    def __getitem__(self, idx):
      # idx is positional; translate it to the frame's index label first.
      row_label = self.df.index[idx]
      encoded = tokenizer(self.df.loc[row_label, 'sentence'], **tokenizer_opts)

      item = {name: tensor.flatten() for name, tensor in encoded.items()}
      item['target'] = torch.tensor(self.df.loc[row_label, 'target'], dtype=torch.float)
      return item


# One dataset per split, each over a freshly re-indexed copy of its rows.
tr_ds = TextDFDataset(df.query('split=="tr"').reset_index(drop=True))
val_ds = TextDFDataset(df.query('split=="val"').reset_index(drop=True))
ts_ds = TextDFDataset(df.query('split=="ts"').reset_index(drop=True))


In [None]:
# The collator keeps no per-loader state, so a single instance serves all three loaders.
_pad_collate = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the training loader shuffles; validation and test keep dataset order.
tr_ld = DataLoader(tr_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=_pad_collate)
val_ld = DataLoader(val_ds, batch_size=VALID_BATCH_SIZE, shuffle=False, collate_fn=_pad_collate)
ts_ld = DataLoader(ts_ds, batch_size=VALID_BATCH_SIZE, shuffle=False, collate_fn=_pad_collate)

### проверка

In [None]:
# Pull the first validation batch for a quick smoke test of the pipeline.
batch = next(iter(val_ld))


## архитектура модели

In [None]:
class TextModelClass(torch.nn.Module):
    """Encoder (module-level ``bert_model``) followed by a two-layer classification head.

    Head: Linear(hidden, hidden) -> ReLU -> Dropout(DROPOUT_RATIO) -> Linear(hidden, CLASS_NUM).
    ``forward`` returns raw logits (no sigmoid applied).
    """

    def __init__(self):

        super().__init__()
        self.bert = bert_model
        # Read the encoder width from its config instead of hard-coding 768, so the
        # head also fits encoders with a different hidden size.
        hidden_size = self.bert.config.hidden_size
        self.lin = torch.nn.Linear(hidden_size, hidden_size)
        self.drop_out = torch.nn.Dropout(DROPOUT_RATIO)
        self.lin_out = torch.nn.Linear(hidden_size, CLASS_NUM)

    def forward(self, X):
        """Compute logits for one batch.

        Args:
            X: mapping of tokenizer outputs to tensors; a 'target' entry, if
               present, is ignored. Tensors are moved to DEVICE here.

        Returns:
            Tensor of logits with CLASS_NUM values per sample.

        Raises:
            ValueError: if the module-level MODE is neither 'cls' nor 'pooler'.
        """
        # Fail early with a clear message; previously an unknown MODE passed the raw
        # encoder output object on to the linear layer.
        if MODE not in ('cls', 'pooler'):
            raise ValueError(f"Unsupported MODE {MODE!r}; expected 'cls' or 'pooler'")

        out = self.bert(**{k: v.to(DEVICE) for k, v in X.items() if k!='target'})
        if MODE=='cls':
            # Representation of the first token of the last hidden layer.
            out = out.last_hidden_state[:,0,:]
        else:
            out = out.pooler_output
        out = self.lin(out)
        # Functional ReLU: no module is constructed on every forward pass.
        out = torch.nn.functional.relu(out)
        out = self.drop_out(out)
        out = self.lin_out(out)

        return out

model = TextModelClass()
model = model.to(DEVICE)



In [None]:
# Freeze every encoder parameter; the head's parameters are left trainable.
model.bert.requires_grad_(False)

### проверка

out = model(batch)
out

## обучение

In [None]:
from tqdm.auto import tqdm

# NOTE(review): this reassigns the EPOCH_NUM defined in the variables cell.
EPOCH_NUM = 2
# Multi-label objective: independent sigmoid + binary cross-entropy per class.
loss_fn = torch.nn.BCEWithLogitsLoss()

# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
optimizer = torch.optim.RMSprop(model.parameters(), lr=LEARNING_RATE)

# Validate every `val_iter_num` epochs.
val_iter_num = 1
refresh_cache_iter = 10  # not used in this cell
progress_bar = tqdm(range(EPOCH_NUM))

# epoch -> {'loss_tr', 'loss_val', 'pr_auc_batch', 'pr_auc'}
loss_d = {}

model = model.to(DEVICE)

for epoch in range(1, EPOCH_NUM+1):
    # ---- training pass ----
    model.train()
    tr_batch_num = len(tr_ld)
    tr_loss_epoch = 0
    for batch_tr in tr_ld:
        optimizer.zero_grad()
        out = model(batch_tr)

        loss = loss_fn(out, batch_tr['target'].to(DEVICE))
        loss.backward()
        optimizer.step()
        tr_loss_epoch = tr_loss_epoch + loss.item()

    # Validation fields default to NaN so that every epoch has the same keys even
    # when validation is skipped. The original read the validation variables
    # unconditionally, which raised NameError (or reused stale values) whenever
    # val_iter_num > 1.
    epoch_stats = {'loss_tr':tr_loss_epoch/tr_batch_num,
                   'loss_val':float('nan'),
                   'pr_auc_batch':float('nan'),
                   'pr_auc':float('nan')}

    # ---- validation pass ----
    if epoch%val_iter_num==0:
        model.eval()
        val_batch_num = len(val_ld)
        val_loss_epoch = 0
        pr_auc = 0
        res_d = defaultdict(list)
        with torch.no_grad():
            for batch_val in val_ld:
                pred = model(batch_val)
                val_loss = loss_fn(pred, batch_val['target'].to(DEVICE))
                val_loss_epoch = val_loss_epoch+val_loss.item()
                sigm_preds = pred.sigmoid().cpu()

                # Per-batch score: metric_multi gives 1 to a class that is absent from
                # the batch and has no prediction above 0.5, so this average is not
                # comparable with the whole-set 'pr_auc' below.
                pr_auc = pr_auc + metric_multi(batch_val['target'].numpy(), sigm_preds.numpy(), average_precision_score)[0]

                res_d['target'].append(batch_val['target'].numpy())
                res_d['pred'].append(sigm_preds.numpy())

            # Flatten per-batch arrays into per-sample rows.
            res_d['target'] = list(chain(*res_d['target']))
            res_d['pred'] = list(chain(*res_d['pred']))

        epoch_stats.update({'loss_val':val_loss_epoch/val_batch_num,
                            'pr_auc_batch':pr_auc/val_batch_num,
                            # Score computed once over the full validation set.
                            'pr_auc':metric_multi(np.array(res_d['target']), np.array(res_d['pred']), average_precision_score)[0]})

    loss_d[epoch] = epoch_stats

    progress_bar.update(1)

# Finalise the bar so it is rendered as complete.
progress_bar.close()

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Whole-validation-set score recorded for each epoch.
[it['pr_auc'] for it in loss_d.values()]

[0.4956415581925097, 0.50878146183961]

In [None]:
# Mean of the per-batch validation scores recorded for each epoch.
[it['pr_auc_batch'] for it in loss_d.values()]

[0.8203437856396458, 0.8192513043248795]

In [None]:
# Mean validation loss recorded for each epoch.
[it['loss_val'] for it in loss_d.values()]

[0.10077805594518417, 0.10016990725587054]

# Дотренировка

In [None]:
# !mkdir '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models'

In [None]:
# torch.save(model, '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/jackaduma_model15.pt')
# Pickles the whole module object (not just a state_dict), so loading it back
# requires the TextModelClass definition to be available.
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/allenai_model12.pt')

In [None]:
# model= torch.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/jackaduma_model15.pt')
# The file holds a fully pickled module, so:
#  - weights_only=False is required on PyTorch versions where torch.load defaults to
#    weights-only loading (only do this for files you trust: unpickling runs code);
#  - map_location=DEVICE lets a checkpoint saved on GPU be opened on a CPU-only runtime.
# NOTE(review): the save cell writes allenai_model12.pt while this loads
# allenai_model6.pt - confirm which checkpoint is intended.
model= torch.load('/content/drive/MyDrive/Colab Notebooks/texts/sec_bert/data/models/allenai_model6.pt',
                  map_location=DEVICE, weights_only=False)

# Проверка предсказаний

In [None]:


# Targets and sigmoid probabilities on the validation and training loaders.
# tr_ld shuffles, but targets are collected together with predictions, so rows stay aligned.
res_val = get_preds(model, ld=val_ld)
res_tr = get_preds(model, ld=tr_ld)

In [None]:
# Per-class scores on the validation split.
fin_metric, res_l = metric_multi(np.array(res_val['target']), np.array(res_val['pred']), average_precision_score)

res_val_df = pd.DataFrame({'qual':res_l, 'class':mlb.classes_})

# Same for the training split; fin_metric and res_l are overwritten here, so after
# this cell they hold the training values.
fin_metric, res_l = metric_multi(np.array(res_tr['target']), np.array(res_tr['pred']), average_precision_score)

res_tr_df = pd.DataFrame({'qual':res_l, 'class':mlb.classes_})

In [None]:
# Classes whose training-split score is below 0.4.
res_tr_df.query('qual<0.4')

Unnamed: 0,qual,class
6,0.261875,lateral-movement
9,0.364619,resource-development
11,0.182767,impact
12,0.344876,initial-access
13,0.351032,exfiltration


In [None]:
# Validation-split scores of the classes scoring above 0.4.
res_val_df.query('qual>0.4')['qual']

0     0.622477
1     0.511515
2     0.511755
3     0.453695
4     0.653399
5     0.459702
8     0.808540
12    0.427619
Name: qual, dtype: float64

In [None]:
# Label counts per split again, for comparison with the per-class scores.
df.drop(columns='target').explode('labels').pivot_table(index = ['split'], columns = ['labels'], values='labels', aggfunc='count' )

labels,collection,command-and-control,credential-access,defense-evasion,discovery,execution,exfiltration,impact,initial-access,lateral-movement,persistence,privilege-escalation,reconnaissance,resource-development
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
tr,1253,2559,969,5254,2964,2517,370,244,793,474,1719,2135,140,368
ts,52,106,41,218,122,105,15,10,33,20,71,88,6,15
val,52,108,40,216,125,109,16,10,33,20,73,91,6,15
