In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Read the packet feature vectors into a float array.
# NOTE(review): genfromtxt is called without a `delimiter`, so fields are split
# on whitespace even though the file is named .csv -- confirm the file format.
dataset = "results/all_vec.csv"
data_list = np.genfromtxt(dataset).astype(float)
print("Dataset size: ", data_list.shape)

In [None]:
# Hyperparameters for the detection pipeline.

# Number of leading packets used to fit the packet feature encoding.
train_encoding = 10_000

# Number of packets used to fit the clustering module.
# NOTE(review): the clustering cell later re-assigns this name to
# `100000 // W_seg` (a frame count) from a repeated literal.
train_clusters = 100_000

# Framing length: number of packets per frame.
W_seg = 50

# Divisor applied after the log transform.
C = 10

# Number of clusters.
# NOTE(review): the KMeans call in this notebook hard-codes n_clusters=20 and
# does not read K -- confirm which value is intended.
K = 10

In [None]:
# Learn the packet feature encoding with PCA (20 components).
# Only the first `train_encoding` packets are used for fitting; the rest of the
# data is transformed with the fitted model afterwards.
train_encoding_data = data_list[:train_encoding]

# random_state pins the randomized SVD solver that PCA may pick automatically,
# so Restart & Run All reproduces the same components.
pca = PCA(n_components=20, random_state=0)
pca.fit(train_encoding_data)


In [None]:
# Project every packet that was NOT used to fit the PCA onto the learned
# components; these embeddings feed the framing / DFT stage.
remaining_data = data_list[train_encoding:]
embedded_data = pca.transform(remaining_data)

In [None]:
# Count the embedded packets and how many complete frames of W_seg packets
# they form (integer division drops a trailing partial frame).
n_packets = len(embedded_data)
n_frames = n_packets // W_seg
print("Number of packets: ", n_packets)
print("Number of frames: ", n_frames)

In [None]:
# Compute the modulus of the DFT of every frame.
#
# Keep only the packets that fill complete frames and view them as
# (n_frames, W_seg, n_components). Using the explicit component count (rather
# than -1) keeps the reshape valid when n_frames is 0.
frames = embedded_data[:n_frames * W_seg].reshape(
    n_frames, W_seg, embedded_data.shape[1]
)

# The transform has to run along the packet axis of each frame (axis=1).
# np.fft.fft defaults to the last axis, which here is the PCA-component axis,
# so calling it without `axis` would mix components instead of analysing the
# packet sequence.
modulus_dft = np.abs(np.fft.fft(frames, axis=1))

print("Modulus DFT shape: ", modulus_dft.shape)

In [None]:
# Log-compress the DFT moduli: log2(modulus + 1) / C.
log_modulus_dft = np.log2(modulus_dft + 1) / C

# Report how many entries are not finite before scrubbing them.
nan_mask = np.isnan(log_modulus_dft)
inf_mask = np.isinf(log_modulus_dft)
print("NaN values: ", nan_mask.sum())
print("Inf values: ", inf_mask.sum())

# Keep finite entries as they are; every NaN / Inf entry becomes 0.
log_modulus_dft = np.where(np.isfinite(log_modulus_dft), log_modulus_dft, 0)

In [None]:
# Number of leading frames used to fit the clustering model.
# NOTE(review): this overwrites the packet-count hyperparameter
# `train_clusters` with a frame count built from a repeated literal; kept as is
# so later cells that read `train_clusters` see the same value.
train_clusters = 100000 // W_seg

# Collapse each frame to one feature vector by averaging over the last axis.
# The mean is computed once and sliced; train_data / test_data are views of
# all_data.
all_data = np.mean(log_modulus_dft, axis=2)
train_data = all_data[:train_clusters]
test_data = all_data[train_clusters:]

# Fit KMeans clustering model on training data.
# random_state fixes the centroid initialisation so reruns give the same
# clusters (and therefore the same distances downstream).
# NOTE(review): n_clusters is hard-coded to 20 although the hyperparameter cell
# declares K = 10 -- confirm which is intended.
kmeans = KMeans(n_clusters=20, n_init='auto', random_state=0)
kmeans.fit(train_data)

In [None]:
def get_l2_distances(dataset, kmeans):
    """Return the L2 distance from each row of `dataset` to its assigned cluster center.

    Args:
        dataset: 2-D array with one sample per row.
        kmeans: fitted clustering model exposing `predict` and
            `cluster_centers_`.

    Returns:
        1-D array with one distance per row of `dataset`.

    Side effects:
        Prints the mean and standard deviation of the distances.
    """
    # Look up, for every sample, the center of the cluster it is assigned to.
    assigned = kmeans.predict(dataset)
    centers = kmeans.cluster_centers_[assigned]

    # Row-wise Euclidean norm of the residuals.
    residuals = dataset - centers
    distances = np.linalg.norm(residuals, axis=1)

    print("L2 distances mean: ", distances.mean())
    print("L2 distances std: ", distances.std())
    return distances

In [None]:
# Histogram (100 bins) of the distance to the assigned cluster center,
# computed over every frame.
all_frame_distances = get_l2_distances(all_data, kmeans)
plt.hist(all_frame_distances, bins=100)
plt.show()

In [None]:
# Same histogram, restricted to the frames that were not used to fit KMeans.
test_frame_distances = get_l2_distances(test_data, kmeans)
plt.hist(test_frame_distances, bins=100)
plt.show()

In [None]:
# Distance threshold (not read anywhere else in this cell).
threshold = 1.4

# Expand per-frame distances back to one value per packet.
num_data_pts = data_list.shape[0]
l2_distances = get_l2_distances(all_data, kmeans)

# The first `train_encoding` packets were only used to fit the encoding and
# have no distance, so they get a 0 placeholder; every frame distance is then
# repeated for each of its W_seg packets.
encoding_padding = np.zeros(train_encoding)
per_packet = np.repeat(l2_distances, W_seg)
l2_dist_packet = np.concatenate((encoding_padding, per_packet))
print("L2 distances shape: ", l2_dist_packet.shape)


In [None]:
# Read the per-packet labels into a float array.
# NOTE(review): as with the feature file, no delimiter is given to genfromtxt,
# so fields are split on whitespace -- confirm the file layout.
labels = "mirai_labels.csv"
labels_list = np.genfromtxt(labels).astype(float)
print("Labels shape: ", labels_list.shape)

In [None]:
# Split the per-packet distances into anomalous and normal populations and
# save them to disk.
n_scored = l2_dist_packet.shape[0]
if labels_list.shape[0] < n_scored:
    # Same failure type the element-wise lookup would produce, raised up front.
    raise IndexError(
        f"labels_list has {labels_list.shape[0]} entries but {n_scored} packets were scored"
    )

packet_idx = np.arange(n_scored)
is_anomaly = labels_list[:n_scored] == 1

# Positions 0 .. train_encoding-1 only hold the zero placeholder, so normal
# packets are taken from index `train_encoding` onwards. `>=` (not `>`) keeps
# the first genuinely scored packet.
is_scored = packet_idx >= train_encoding

# NOTE(review): packets labelled 1 are kept regardless of position, so an
# anomaly inside the placeholder prefix would contribute a distance of 0.
anomaly_indices = packet_idx[is_anomaly]
anomaly_dist = l2_dist_packet[is_anomaly]
normal_dist = l2_dist_packet[~is_anomaly & is_scored]

np.save("results/normal_dist.npy", normal_dist)
np.save("results/anomaly_dist.npy", anomaly_dist)
np.save("results/anomaly_indices.npy", anomaly_indices)

In [None]:
# Calculate AUC of ROC curve
def get_roc_auc(normal_rmses, anomaly_rmses):
    """Return the fraction of (normal, anomaly) pairs whose normal score is strictly larger.

    Every normal score is compared with every anomaly score; a pair counts when
    ``normal > anomaly``. Ties and pairs involving NaN count as 0.

    NOTE(review): if larger scores are meant to indicate anomalies, the
    conventional ROC AUC counts ``anomaly > normal`` instead, i.e. the opposite
    orientation of what is returned here -- confirm what callers expect.

    Args:
        normal_rmses: 1-D array of scores for normal samples.
        anomaly_rmses: 1-D array of scores for anomalous samples.

    Returns:
        float in [0, 1]: number of counted pairs divided by the number of pairs.

    Raises:
        ZeroDivisionError: if either input is empty.
    """
    normal_scores = np.asarray(normal_rmses)
    anomaly_scores = np.asarray(anomaly_rmses)
    n_normal = normal_scores.shape[0]
    n_anomaly = anomaly_scores.shape[0]

    # NaN is never greater than (or less than) anything, so NaN scores cannot
    # contribute a counted pair; drop them before sorting/searching. The
    # denominator still uses the full input sizes.
    normal_valid = normal_scores[~np.isnan(normal_scores)]
    anomaly_sorted = np.sort(anomaly_scores[~np.isnan(anomaly_scores)])

    # For each normal score, side="left" yields how many anomaly scores are
    # strictly below it. This replaces the pairwise double loop with a sort
    # plus binary searches.
    wins = int(np.searchsorted(anomaly_sorted, normal_valid, side="left").sum())

    return wins / (n_normal * n_anomaly)