In [1]:
import os
import sys
import torch
from transformers import BertTokenizer
import lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))  
os.path.abspath(os.path.join(os.getcwd(), os.pardir))

'/home/zhulin/workspace/Jack'

In [3]:
pl.seed_everything(42, workers=True)
torch.set_float32_matmul_precision(precision="high")

Seed set to 42


In [4]:
args = {
    "pretrain": '/home/zhulin/pretrain/bert_pretrain_uncased/',
    "model": "/home/zhulin/models/single_channel_transformer.ckpt",
    "dataset": "/mnt/sdd1/data/zhulin/jack/cdatasets.test.5.csv"
}


In [5]:
### load model
from core.predictor import SingleChannelPredictor
tokenizer = BertTokenizer.from_pretrained(args["pretrain"], use_fast=True)

ckpt = torch.load(args["model"])
predictor = SingleChannelPredictor(**ckpt["hyper_parameters"])
predictor.load_state_dict(ckpt["state_dict"])
predictor = predictor.eval().cuda()

In [6]:
### load data
import datatable as dt

data = dt.fread(args["dataset"], fill=True)
data = data[(dt.f.index == "k5c7fb0927db37372da25f270708103a2") & (dt.f.pname == "!WannaDecryptor!.exe") , :]
data

Unnamed: 0_level_0,index,unique_key,pid,pname,label,channel,cnt
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪▪▪▪▪,▪▪▪▪
0,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,ProcessStart !WannaDecryptor!.exe f ThreadStart 3024 ImageLoad C:\Users\Administrator\Desktop\!Wanna…,8
1,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,CallStack RtlDestroyMemoryBlockLookaside LdrFindEntryForAddress LdrFindEntryForAddress LdrResRelease…,8
2,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,CallStack RtlDestroyMemoryBlockLookaside LdrResRelease LdrSetAppCompatDllRedirectionCallback RtlSubA…,8
3,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,CallStack RtlDestroyMemoryBlockLookaside LdrResRelease LdrSetAppCompatDllRedirectionCallback RtlSubA…,8
4,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,ImageLoad C:\Windows\SysWOW64\KernelBase.dll CallStack PssWalkSnapshot RtlDestroyMemoryBlockLookasid…,8
5,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,CallStack PssWalkSnapshot RtlDestroyMemoryBlockLookaside LdrResRelease LdrSetAppCompatDllRedirection…,8
6,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,FileIOCreate CallStack PssWalkSnapshot PssWalkSnapshot RtlDestroyMemoryBlockLookaside LdrResRelease…,8
7,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,FileIOCreate CallStack PssWalkSnapshot PssWalkSnapshot RtlDestroyMemoryBlockLookaside LdrResRelease…,8
8,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,FileIOCleanup C:\Users\Administrator\Desktop\!WannaDecryptor!.exe FileIOClose C:\Users\Administrator…,8
9,k5c7fb0927db37372da25f270708103a2,2364563584,3532,!WannaDecryptor!.exe,1,CallStack PssWalkSnapshot RtlDestroyMemoryBlockLookaside LdrResRelease LdrSetAppCompatDllRedirection…,8


In [7]:
from common.extractor import FeatureExtractor

# input = "ProcessStart !WannaDecryptor!.exe"
input = "ImageLoad C:\Windows\SysWOW64\KernelBase.dll"
input = "CallStack RtlDestroyMemoryBlockLookaside LdrFindEntryForAddress LdrFindEntryForAddress"

extractor = FeatureExtractor()
extractor.register(predictor, ["net.transformer.layers.0.attn"])
predictor = predictor.eval().cuda()
padded_sent_seq = tokenizer(input, padding=True, truncation=True, max_length=2048, return_tensors="pt")
data_length = torch.tensor([sum(mask) for mask in padded_sent_seq["attention_mask"]])

with torch.no_grad():
    pred = predictor(padded_sent_seq["input_ids"].cuda(), padded_sent_seq["attention_mask"].cuda(), data_length)
pred

tensor([[0.1628]], device='cuda:0')

In [8]:
attentions = extractor.features["net.transformer.layers.0.attn"][0][-1]
tokens = tokenizer.convert_ids_to_tokens(padded_sent_seq["input_ids"][0])

In [9]:
print(attentions.shape)

torch.Size([1, 8, 22, 22])


In [10]:
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view, head_view
utils.logging.set_verbosity_error()  # Suppress standard warnings
tokenizer.add_special_tokens({ "additional_special_tokens": ["[unused1]", "[unused2]", "[unused3]"] })
inputs = tokenizer.encode(input, return_tensors='pt')  # Tokenize input text
print('inputs:', inputs)
print('tokens:', tokens)

html_head_view = head_view([attentions], tokens, html_action='return')

with open("head_view.html", 'w') as file:
    file.write(html_head_view.data)


inputs: tensor([[  101,     3,     5, 25510, 12881, 22254,  4765,  2854, 29278,  4215,
         16200,  4757, 25510, 12881, 22254,  4765,  2854, 29278,  4215, 16200,
          4757,   102]])
tokens: ['[CLS]', 'callstack', 'rtldestroymemoryblocklookaside', 'ld', '##rf', '##ind', '##ent', '##ry', '##for', '##ad', '##dre', '##ss', 'ld', '##rf', '##ind', '##ent', '##ry', '##for', '##ad', '##dre', '##ss', '[SEP]']


In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
### 绘制热力图
def attention_plot(attention, x_texts, y_texts=None, figsize=(15, 10), annot=False, path="./png"):
    plt.clf()
    fig, ax = plt.subplots(figsize=figsize)
    sns.set(font_scale=1.25)
    hm = sns.heatmap(attention,
                     cbar=True,
                     cmap="RdBu_r",
                     annot=annot,
                     square=True,
                     fmt='.2f',
                     annot_kws={'size': 10},
                     yticklabels=y_texts,
                     xticklabels=x_texts)
    plt.savefig(path)
    plt.close()


In [12]:
### 单头热力图
from tqdm import tqdm
from torch.nn import functional as F

attentions = o_features[0][1]
heads = attentions.shape[1]
texts = tokenizer.convert_ids_to_tokens(padded_sent_seq["input_ids"][0])
os.makedirs("./png", exist_ok=True)

# Attention 归一化
attentions = F.normalize(attentions, p=2, dim=-1)
# 创建 pandas DataFrame
import pandas as pd
for i in tqdm(range(heads)):
    pd.DataFrame(attentions[0, i, :, :].cpu().numpy()).to_csv(f"output{i}.csv", index=False, header=False)

NameError: name 'o_features' is not defined

In [None]:
!rm -f output0.csv
!rm -f output1.csv
!rm -f output2.csv
!rm -f output3.csv
!rm -f output4.csv
!rm -f output5.csv
!rm -f output6.csv
!rm -f output7.csv

In [None]:
# ==============================================================
# SUM
# ==============================================================
from torch.nn import functional as F
# 按Heads取和
attentions = o_features[0][1].sum(axis=1)
heads = attentions.shape[1]
texts = tokenizer.convert_ids_to_tokens(padded_sent_seq["input_ids"][0])
os.makedirs("./png", exist_ok=True)

# 显示Attention
# attention_plot(attentions[0, :, :].cpu(), annot=True, x_texts=texts, y_texts=texts, figsize=(15, 15), path=f"./png/sum_head.png")

# Attention 归一化
attentions = F.normalize(attentions, p=2, dim=-1)
# attention_plot(attentions[0, :, :].cpu(), x_texts=texts, y_texts=texts, annot=True, figsize=(15, 15), path=f"./png/sum_norm_head.png")

# 创建 pandas DataFrame
# import pandas as pd
# pd.DataFrame(attentions[0, :, :].cpu().numpy()).to_csv("output.csv", index=False, header=False)


['system',
 'sy',
 '##cr',
 'administrator',
 'callstack',
 'users',
 'process',
 'dll',
 '##dl',
 'threadstart',
 'dll',
 'imageload',
 '##dl',
 'imageload',
 'imageload',
 '##star']