In [1]:
import os
import sys
import torch
from transformers import BertTokenizer
import lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))  
os.path.abspath(os.path.join(os.getcwd(), os.pardir))

'/home/zhulin/workspace/Jack'

In [3]:
pl.seed_everything(42, workers=True)
torch.set_float32_matmul_precision(precision="high")

Seed set to 42


In [4]:
args = {
    "pretrain": '/home/zhulin/pretrain/bert_pretrain_uncased/',
    "model": "/home/zhulin/models/single_channel_transformer.ckpt",
    "dataset": "/home/zhulin/datasets/cdatasets.test.5.csv"
}

In [5]:
### load model
from core.predictor import SingleChannelPredictor
tokenizer = BertTokenizer.from_pretrained(args["pretrain"], use_fast=True)

ckpt = torch.load(args["model"])
predictor = SingleChannelPredictor(**ckpt["hyper_parameters"])
predictor.load_state_dict(ckpt["state_dict"])
predictor = predictor.eval().cuda()

In [6]:
for name, module in predictor.named_modules():
    print(name)


net
net.input_net
net.input_net.0
net.input_net.1
net.input_net.2
net.positional_encoding
net.transformer
net.transformer.layers
net.transformer.layers.0
net.transformer.layers.0.attn
net.transformer.layers.0.attn.qkv_proj
net.transformer.layers.0.attn.o_proj
net.transformer.layers.0.linear_net
net.transformer.layers.0.linear_net.0
net.transformer.layers.0.linear_net.1
net.transformer.layers.0.linear_net.2
net.transformer.layers.0.linear_net.3
net.transformer.layers.0.norm1
net.transformer.layers.0.norm2
net.transformer.layers.0.dropout
net.pooling_net
net.output_net
net.output_net.0
net.output_net.1
net.output_net.2
net.output_net.3


In [7]:
### load extractor
from common.extractor import FeatureExtractor

extractor = FeatureExtractor()
extractor.register(predictor, [
    "net.input_net.0",                      # nn.Embedding
    "net.input_net.2",                      # nn.Linear
    "net.pooling_net"                       # nn.MultiheadAttention
])

In [8]:
### load dataset
import numpy as np
import pandas as pd
import datatable as dt

data = dt.fread(args["dataset"], fill=True)
data = data[dt.f.index == "k5c7fb0927db37372da25f270708103a2", :]
data.head(3)

Unnamed: 0_level_0,C0,channel,index,label,pid,pname,unique_key
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,1838633,RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege R…,k5c7fb0927db37372da25f270708103a2,1,1848,WMIADAP.exe,−1
1,1838634,RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege R…,k5c7fb0927db37372da25f270708103a2,1,1848,WMIADAP.exe,−1
2,1838635,RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege RegistryQueryValue ThrottleDrege R…,k5c7fb0927db37372da25f270708103a2,1,1848,WMIADAP.exe,−1


In [9]:
from tqdm import tqdm

extractor.features.clear()
predictor = predictor.eval().cuda()
for i in tqdm(range(data.nrows)):
    padded_sent_seq = tokenizer(data[i, 1], padding=True, truncation=True, max_length=2048, return_tensors="pt")
    data_length = torch.tensor([sum(mask) for mask in padded_sent_seq["attention_mask"]])

    with torch.no_grad():
        pred = predictor(padded_sent_seq["input_ids"].cuda(), padded_sent_seq["attention_mask"].cuda(), data_length)

100%|██████████| 42698/42698 [03:47<00:00, 187.93it/s]


In [10]:
features_after_embedding = torch.cat([x.mean(dim=1) for x in extractor.features["net.input_net.0"]])
features_after_embedding.shape

torch.Size([42698, 64])

In [11]:
## t-sne降维分析
from sklearn.manifold import TSNE 
from matplotlib import pyplot as plt

features = features_after_embedding.cpu().numpy()

tsne = TSNE(n_components=2) 
x = tsne.fit_transform(features)  

# 设置图例样式
maker = ['o', 's', '^', 's', 'p', '*', '<', '>', 'D', 'd', 'h', 'H']
colors = ['#e38c7a', '#656667', '#99a4bc', 'cyan', 'blue', 'lime', 'r', 'violet', 'm', 'peru', 'olivedrab', 'hotpink']
labels = ['a', 'b', 'c', 'd']
font = {'family': 'Times New Roman', 'weight': 'bold', 'size': 32}

# 图像绘制
def plotlabels(x, y, labels, name):
    data = pd.DataFrame({'x': x, 'y': y, 'label': labels})
    for index in range(2):  # 假设总共有三个类别，类别的表示为0,1,2
        X = data.loc[data['label'] == index]['x']
        Y = data.loc[data['label'] == index]['y']
        plt.scatter(X, Y, cmap='brg', s=100, marker=maker[index], c=colors[index], edgecolors=colors[index], alpha=0.65)

        plt.xticks([])  # 去掉横坐标值
        plt.yticks([])  # 去掉纵坐标值
    plt.title(name, fontsize=32, fontweight='normal', pad=20)


fig = plt.figure(figsize=(10, 10))
plotlabels(features[:, 0], features[:, 1], data[:, dt.f.label].to_numpy())
plt.show(fig)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.