In [1]:
import numpy as np
from halutmatmul.halutmatmul import HalutMatmul
import torch
import torch.nn.functional as F

In [73]:
import numpy as np
import csv
from tqdm import tqdm

def read_csv_data(file_path, num_lines=3000000):
    """Read labelled feature rows from a CSV file.

    Each kept row contributes ``int(float(row[0]))`` to ``labels`` and
    ``[float(x) for x in row[1:]]`` to ``data``.  Rows whose column 7
    parses to an integer below 8 are skipped, as are completely blank rows.

    Args:
        file_path: Path of the CSV file to read.
        num_lines: Stop once this many rows have been kept.

    Returns:
        ``(data, labels)`` — a list of float feature lists and the matching
        list of integer labels.

    Raises:
        ValueError: If a cell that must be numeric cannot be parsed.
        IndexError: If a non-blank row has fewer than 8 columns.
    """
    data = []
    labels = []
    # newline='' is what the csv module requires for files handed to csv.reader,
    # so that newlines embedded in quoted fields are interpreted correctly.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for row in tqdm(reader, desc="Reading CSV"):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            if int(row[7]) < 8:
                continue
            labels.append(int(float(row[0])))  # first column is the label
            data.append([float(x) for x in row[1:]])  # remaining columns are features
            if len(data) >= num_lines:
                break
    return data, labels

# Load the model-parameter JSON file
# NOTE: this JSON-based path is disabled, so no `model` object is created here;
# parameters are taken from the PyTorch state dict loaded further down.
# json_file = '../simulate/model_int.json'
# json_file = '../simulate/model_oldz.json'


# with open(json_file, 'r') as file:
#     model_data = json.load(file)

# Parse the model parameters
# layers_data = model_data['layers']

# weights = []
# biases = []

# for layer_data in layers_data:
#     weights.append(np.array(layer_data['weights']))
#     biases.append(np.array(layer_data['biases']))

# Create the model instance
# model = MultiLayerPerceptron(weights, biases)

# Load the trained parameters (state dict) from the checkpoint.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
model_file = '../fingerprint_model.pth'
device = torch.device('cpu')  # move the model to the CPU device
model_state_dict = torch.load(model_file, map_location=device)

# Prepare the training data
train_csv_file = '../dataset/fingerprint/unlimit/train_redeal.csv'  # input CSV file path

train_data, train_labels = read_csv_data(train_csv_file)

# Prepare the test data
test_csv_file = '../dataset/fingerprint/unlimit/test_redeal.csv'  # input CSV file path
test_data, test_labels = read_csv_data(test_csv_file)

# Alternative (botnet) dataset, currently disabled:
# train_csv_file = '../dataset/botnet/train_redeal.csv'  # input CSV file path
# train_data, train_labels = read_csv_data(train_csv_file,200000)

Reading CSV: 2175588it [00:14, 149919.60it/s]
Reading CSV: 699094it [00:02, 260431.38it/s]


In [74]:
def adjust_norm_layer(weight, bias, running_var, running_mean, eps=1e-5):
    """Fold a normalization layer's running statistics into a scale and shift.

    The layer computes ``(x - running_mean) / sqrt(running_var + eps) * weight + bias``.
    This returns ``(scale, shift)`` such that the same result is
    ``x * scale + shift``.

    Args:
        weight: Learned per-feature scale of the normalization layer.
        bias: Learned per-feature shift of the normalization layer.
        running_var: Per-feature running variance.
        running_mean: Per-feature running mean.
        eps: Value added to the variance before the square root.  Defaults to
            1e-5; it should match the epsilon the layer was trained with.

    Returns:
        Tuple ``(adjusted_weight, adjusted_bias)``.
    """
    adjusted_weight = weight / np.sqrt(running_var + eps)
    adjusted_bias = bias - running_mean * adjusted_weight
    return adjusted_weight, adjusted_bias

# Layer name prefixes in the state dict, in forward order.
layer_indexes = ['norm0', 'fc1', 'norm1', 'fc2', 'norm2', 'output']

# Collect each layer's parameters as NumPy arrays (weights are transposed).
weight = []
bias = []
for index in layer_indexes:
    weight.append(model_state_dict[f'{index}.weight'].numpy().T)
    bias.append(model_state_dict[f'{index}.bias'].numpy())

# Replace the parameters of every normalization layer by the folded
# scale/shift that already accounts for its running mean and variance.
for i, index in enumerate(layer_indexes):
    if 'norm' not in index:
        continue
    weight[i], bias[i] = adjust_norm_layer(
        weight[i],
        bias[i],
        model_state_dict[f'{index}.running_var'].numpy(),
        model_state_dict[f'{index}.running_mean'].numpy(),
    )


In [87]:
def fix_reset(x, num):
    """Clear the lowest ``-num`` bits of every element of ``x``, in place.

    ``x`` is indexed as ``x[i][j]`` and must hold integers (the update uses
    bit shifts).  Nothing is modified when ``num`` is zero or positive.
    Returns ``None``.
    """
    for r in range(len(x)):
        row = x[r]
        for c in range(len(row)):
            if num >= 0:
                continue
            shift = -num
            # Shift right then left by the same amount to zero the low bits.
            row[c] = (row[c] >> shift) << shift


def get_layers(data, weights=None, biases=None):
    """Run the reference forward pass and keep every intermediate activation.

    The network is ``norm0 -> ReLU -> fc1 -> norm1 -> ReLU -> fc2 -> norm2 ->
    ReLU -> output``.  Normalization layers are applied as an element-wise
    ``x * scale + shift``; their parameters must already be folded.

    Args:
        data: Input samples, one row per sample.
        weights: Six per-layer weights ordered
            ``[norm0, fc1, norm1, fc2, norm2, output]``.  Defaults to the
            notebook-level ``weight`` list.
        biases: Six per-layer biases in the same order.  Defaults to the
            notebook-level ``bias`` list.

    Returns:
        A list of 10 arrays:
        ``[input, norm0, relu, fc1, norm1, relu, fc2, norm2, relu, output]``.
    """
    if weights is None:
        weights = weight
    if biases is None:
        biases = bias

    layers = [np.array(data)]

    # Input normalization followed by ReLU.
    layers.append(layers[-1] * weights[0] + biases[0])
    layers.append(np.maximum(layers[-1], 0))

    # Two hidden blocks: linear -> folded normalization -> ReLU.
    for fc, norm in ((1, 2), (3, 4)):
        layers.append(np.dot(layers[-1], weights[fc]) + biases[fc])
        layers.append(layers[-1] * weights[norm] + biases[norm])
        layers.append(np.maximum(layers[-1], 0))

    # Output layer (no activation).
    layers.append(np.dot(layers[-1], weights[5]) + biases[5])

    return layers


def reset_bias(hm, bias):
    """Add ``bias[i]`` to every entry of ``hm.luts[i, 0, :]``, in place.

    Only index 0 along the second axis is changed.  ``hm.luts`` itself is
    modified, so calling this twice adds the bias twice.

    Returns:
        The same array object as ``hm.luts``.
    """
    table = hm.luts
    n_rows = table.shape[0]
    n_cols = table.shape[2]
    for r in range(n_rows):
        for c in range(n_cols):
            table[r, 0, c] += bias[r]
    return table


# Reference (floating-point) activations of every layer for both data splits.
# Index layout follows get_layers: 0 = input, ..., last = output-layer result.
train_layers = get_layers(train_data)
test_layers = get_layers(test_data)

In [None]:
# Lookup-table replacement for the first linear layer (fc1).
# Parameters come from the state-dict-derived `weight` / `bias` lists; no
# `model` object exists in this notebook.  weight[1] / bias[1] are fc1's
# parameters and train_layers[2] is the activation fed into fc1, mirroring how
# hm_3 is built from train_layers[-2] and weight[5].
# NOTE(review): confirm C=15 is compatible with fc1's input width.
hm_1 = HalutMatmul(C=15, K=32)
hm_1.learn_offline(train_layers[2], weight[1])
lut1 = reset_bias(hm_1, bias[1])

In [None]:
# Lookup-table replacement for the second linear layer (fc2).
# Parameters come from the state-dict-derived `weight` / `bias` lists; no
# `model` object exists in this notebook.  weight[3] is already transposed
# where the lists are built, and train_layers[5] is the activation fed into
# fc2, mirroring how hm_3 is built from train_layers[-2] and weight[5].
# NOTE(review): confirm C=32 is compatible with fc2's input width.
hm_2 = HalutMatmul(C=32, K=32)
hm_2.learn_offline(train_layers[5], weight[3])
lut2 = reset_bias(hm_2, bias[3])

In [88]:
# Lookup-table replacement for the output layer.
# It is learned on train_layers[-2] (the activation fed into the output layer)
# together with the output weights; reset_bias then adds the output bias into
# the lookup table in place.
hm_3 = HalutMatmul(C=16, K=32)
hm_3.learn_offline(train_layers[-2], weight[5])
lut3 = reset_bias(hm_3, bias[5])

Learning simple k-means prototypes (2101216, 16)
Initializing simple k-means prototypes with zero
Training PQ slice 0/16
Sampling a subset of 4096 / 2101216 for training
Clustering 4096 points in 1D to 16 clusters, redo 1 times, 25 iterations
  Preprocessing in 0.06 s
  Iteration 24 (0.00 s, search 0.00 s): objective=18.0746 imbalance=8.891 nsplit=0       
Training PQ slice 1/16
Sampling a subset of 4096 / 2101216 for training
Clustering 4096 points in 1D to 16 clusters, redo 1 times, 25 iterations
  Preprocessing in 0.06 s
  Iteration 24 (0.01 s, search 0.00 s): objective=45.8367 imbalance=1.168 nsplit=0       
Training PQ slice 2/16
Sampling a subset of 4096 / 2101216 for training
Clustering 4096 points in 1D to 16 clusters, redo 1 times, 25 iterations
  Preprocessing in 0.05 s
  Iteration 24 (0.00 s, search 0.00 s): objective=30.3381 imbalance=1.257 nsplit=0       
Training PQ slice 3/16
Sampling a subset of 4096 / 2101216 for training
Clustering 4096 points in 1D to 16 clusters, re

In [93]:
# Snapshot the learned thresholds so later in-place edits of
# hm_3.splits_lists can be undone by copying this back.
temp = hm_3.splits_lists.copy()

In [117]:
hm_3.splits_lists = temp.copy()

In [120]:
hm_3.splits_lists = (hm_3.splits_lists * (2 ** 2)).astype(np.int8).astype(np.float32) / (2 ** 2)

In [118]:
for i in range(len(hm_3.splits_lists)):
    hm_3.splits_lists[i, :, :-3] = (hm_3.splits_lists[i, :, :-3] - bias[4][i]) / weight[4][i]

In [121]:
# Earlier variants of the threshold adjustment, kept for reference:
# for i in range(len(hm_3.splits_lists)):
#     hm_3.splits_lists[i] = (hm_3.splits_lists[i] - bias[4][i]) / weight[4][i]
# hm_3.splits_lists = (hm_3.splits_lists - bias[4]) / weight[4]

# Evaluate the output-layer lookup on its own: feed it the reference fc2 output
# from before norm2 (test_layers[-4]) and compare the arg-max class with the
# test labels.  The printed message reads "prediction accuracy".
test_halt_3 = hm_3.matmul_online(test_layers[-4])
predicted_labels = np.argmax(test_halt_3, axis=1)
accuracy = np.mean(predicted_labels == test_labels) * 100
print(f"预测准确率: {accuracy:.4f}%")

预测准确率: 95.7728%


In [None]:
# End-to-end evaluation: chain the three lookup-table matmuls, feeding each
# stage's output straight into the next with no explicit normalization or ReLU
# in between.
# NOTE(review): presumably each stage's thresholds have been adjusted to accept
# the previous stage's raw output; only hm_3's adjustment is visible in this
# notebook - verify for hm_1 and hm_2.
test_halt_1 = hm_1.matmul_online(np.array(test_data))
test_halt_2 = hm_2.matmul_online(test_halt_1)
test_halt_3 = hm_3.matmul_online(test_halt_2)

# Arg-max class versus the test labels; the printed message reads "prediction accuracy".
predicted_labels = np.argmax(test_halt_3, axis=1)
accuracy = np.mean(predicted_labels == test_labels) * 100
print(f"预测准确率: {accuracy:.4f}%")

In [None]:
# Author's notes on the shape of splits_lists (written for an earlier configuration):
# 40:
# 6: log 64/2 = 6
# 35: 32 + 3 - the last three entries are dim, scale and offset; the last two
#     are of little use, and dim is the dimension index used for the comparison
# Intervals are left-open, right-closed
hm_2.splits_lists

In [None]:
# Author's notes on the shape of luts (written for an earlier configuration):
# luts: (n, C, K) - C codebooks, each with K centroids; luts stores the product
#       of each centroid with the corresponding vector, and n is the output
#       dimension of the matrix multiplication
# 64: the 64 of the 40*64 matrix size
# 40: the 40 of C=40
# 64: the 64 of K=64
hm_1.luts.shape

In [None]:
def reset(hm, lut, x, y):
    """Quantise the thresholds and lookup table of ``hm`` to fixed point.

    ``hm.splits_lists`` is scaled by ``2 ** x``, truncated through an int16
    cast, and scaled back.  ``hm.luts`` is replaced by ``lut`` scaled by
    ``2 ** y``, rounded, cast through int16, and scaled back.  Both end up
    float32.

    Args:
        hm: Object whose ``splits_lists`` and ``luts`` attributes are overwritten.
        lut: Lookup table to quantise and store as ``hm.luts``.
        x: Power-of-two exponent used as the threshold scale.
        y: Power-of-two exponent used as the lookup-table scale.

    Returns:
        The same ``hm`` object.
    """
    split_scale = 2 ** x
    lut_scale = 2 ** y

    scaled_splits = hm.splits_lists * split_scale
    hm.splits_lists = scaled_splits.astype(np.int16).astype(np.float32) / split_scale

    rounded_lut = np.round(lut * lut_scale)
    hm.luts = rounded_lut.astype(np.int16).astype(np.float32) / lut_scale

    return hm

In [None]:
# Quantise each stage.  The third argument is the power-of-two exponent for the
# thresholds and the fourth is the exponent for the lookup table.
hm_1 = reset(hm_1, lut1, 0, 1)
hm_2 = reset(hm_2, lut2, 1, 2)
hm_3 = reset(hm_3, lut3, 2, 2)

In [None]:
# Inspect one entry of hm_1's threshold table.
# NOTE(review): hm_1 is created with C=15 in this notebook; if the first axis
# of splits_lists has length C, index 19 is out of range - confirm.
hm_1.splits_lists[19]

In [None]:
# Accumulator filled by get_range: one [min, max, index] entry per leaf.
ans = []
def get_range(list, index, num, min, max, depth=5):
    """Walk a binary threshold tree and record the value range of each leaf.

    At level ``num`` the node ``index`` splits on ``list[num][index]``.  The
    left child (``index * 2``) keeps ``min`` and takes the threshold as its
    new ``max``.  The right child (``index * 2 + 1``) takes ``threshold + 1``
    as its new ``min`` and keeps ``max``.  When ``num`` reaches ``depth`` the
    range is appended to the module-level ``ans`` as ``[min, max, index]``,
    but only if ``max > min``.

    The parameter names ``list``, ``min`` and ``max`` shadow built-ins; they
    are kept so existing callers keep working.

    Args:
        list: Per-level thresholds, indexed as ``list[level][node]``.
        index: Node index within the current level.
        num: Current level, starting at 0 for the root.
        min: Lower bound of the range reaching this node.
        max: Upper bound of the range reaching this node.
        depth: Number of levels to descend before recording a leaf.
            Defaults to 5.

    Returns:
        None.  Results are appended to ``ans``.
    """
    if num == depth:
        if max > min:
            ans.append([min, max, index])
        return
    threshold = list[num][index]
    get_range(list, index * 2, num + 1, min, threshold, depth)
    get_range(list, index * 2 + 1, num + 1, threshold + 1, max, depth)

# Collect the leaf ranges for the first entry of hm_3's threshold table,
# starting at the root (index 0, level 0) with the overall range -1 .. 1000.
get_range(hm_3.splits_lists[0], 0, 0, -1, 1000)

In [None]:
# Show the collected [min, max, index] ranges.
ans

In [None]:
from utils import *

In [None]:
# Scale the collected ranges by 4 and store them as int16.
# NOTE(review): the multiplication applies to all three columns of `ans`,
# including the leaf index - confirm that is intended.
ans_16 = (np.array(ans) * 4).astype(np.int16)
feat_dict = {"f1": ans_16[1:]}  # the first range row is left out
# NOTE(review): the key here is "1" while the other two dicts use "f1" -
# verify against what get_feature_table_entries expects.
key_bits = {"1": 16}
key_encode_bits = {"f1": 14}

# Left-closed, right-open
# get_feature_table_entries is presumably provided by the star import from utils.
result = get_feature_table_entries(feat_dict, key_bits, key_encode_bits)

In [None]:
# Recompute the ranges scaled by 4 as int16 (same expression as in the feature-table cell).
ans_16 = (np.array(ans) * 4).astype(np.int16)

In [None]:
# Show the scaled int16 ranges.
ans_16