In [17]:
import numpy as np
import tcbench as tcb
import faiss

In [18]:
vectors_baseline = np.load('artifacts-mirage19/baseline_vectors.npy')
labels_baseline = np.load('artifacts-mirage19/baseline_labels.npy')

vectors_embeddings = np.load('artifacts-mirage19/embeddings_vectors.npy')
labels_embeddings = np.load('artifacts-mirage19/embeddings_labels.npy')

In [19]:
df_split = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10, split=True)
train_val_indices = np.concatenate([df_split.iloc[0][0], df_split.iloc[0][1]])
test_indices = df_split.iloc[0][2]

In [20]:
def build_faiss_index(data, train_indices, distance_type):
    filtered_data = data[train_indices]
    d = filtered_data.shape[1]
    
    if distance_type == 'euclidean':
        index = faiss.IndexFlatL2(d)
    elif distance_type == 'cosine':
        index = faiss.IndexFlatIP(d)
    else:
        raise ValueError("Unsupported distance type. Use 'euclidean' or 'cosine'.")

    index.add(filtered_data)
    return index

In [21]:
def search_and_compare_labels(data, labels, test_indices, train_indices, index, k=5):

    D, I = index.search(data[test_indices], k)

    test_labels = labels[test_indices]
    neighbor_labels = labels[train_indices[I.flatten()]].reshape(I.shape)

    top_1_matches = 0
    top_3_matches = 0
    top_5_matches = 0

    for i in range(len(test_labels)):
        top_1_match = test_labels[i] in neighbor_labels[i, :1]
        top_3_match = test_labels[i] in neighbor_labels[i, :3]
        top_5_match = test_labels[i] in neighbor_labels[i, :5]

        if top_1_match:
            top_1_matches += 1
        if top_3_match:
            top_3_matches += 1
        if top_5_match:
            top_5_matches += 1

    total_tests = len(test_labels)
    top_1_percentage = (top_1_matches / total_tests) * 100
    top_3_percentage = (top_3_matches / total_tests) * 100
    top_5_percentage = (top_5_matches / total_tests) * 100

    return top_1_percentage, top_3_percentage, top_5_percentage

In [22]:
index_baseline = build_faiss_index(vectors_baseline, train_val_indices, distance_type='euclidean')
top_1_baseline, top_3_baseline, top_5_baseline = search_and_compare_labels(vectors_baseline, labels_baseline, test_indices, train_val_indices, index_baseline, k=5)

print(f"Baseline match in top 1 neighbor: {top_1_baseline:.2f}%")
print(f"Baseline match in top 3 neighbors: {top_3_baseline:.2f}%")
print(f"Baseline match in top 5 neighbors: {top_5_baseline:.2f}%")

Baseline match in top 1 neighbor: 64.38%
Baseline match in top 3 neighbors: 74.17%
Baseline match in top 5 neighbors: 78.17%


In [23]:
index_embeddings = build_faiss_index(vectors_embeddings, train_val_indices, distance_type='cosine')
top_1_embeddings, top_3_embeddings, top_5_embeddings = search_and_compare_labels(vectors_embeddings, labels_embeddings, test_indices, train_val_indices, index_embeddings, k=5)

print(f"Embeddings match in top 1 neighbor: {top_1_embeddings:.2f}%")
print(f"Embeddings match in top 3 neighbors: {top_3_embeddings:.2f}%")
print(f"Embeddings match in top 5 neighbors: {top_5_embeddings:.2f}%")

Embeddings match in top 1 neighbor: 20.29%
Embeddings match in top 3 neighbors: 41.63%
Embeddings match in top 5 neighbors: 48.29%
