In [None]:
import numpy as np
import tcbench as tcb
import faiss
import matplotlib.pyplot as plt
# NOTE: build_faiss_index is not provided by faiss; it is defined in a later cell of this notebook.

In [2]:
vectors_baseline = np.load('artifacts-mirage19/baseline_vectors.npy')
labels_baseline = np.load('artifacts-mirage19/baseline_labels.npy')

vectors_embeddings = np.load('artifacts-mirage19/embeddings_vectors.npy')
labels_embeddings = np.load('artifacts-mirage19/embeddings_labels.npy')

In [3]:
df_split = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10, split=True)
train_val_indices = np.concatenate([df_split.iloc[0][0], df_split.iloc[0][1]])
test_indices = df_split.iloc[0][2]

In [None]:
def build_faiss_index(data, labels, train_indices, distance_type, values_per_class):
    np.random.seed(42)

    filtered_data = data[train_indices]
    filtered_labels = labels[train_indices]
    d = filtered_data.shape[1]

    selected_data = []
    selected_indices = []
    unique_labels = np.unique(filtered_labels)
    for label in unique_labels:
        label_indices = np.where(filtered_labels == label)[0]
        if values_per_class is None or values_per_class == -1:
            selected_label_indices = label_indices
        else:
            selected_label_indices = np.random.choice(label_indices, size=min(values_per_class, len(label_indices)), replace=False)
        selected_data.append(filtered_data[selected_label_indices])
        selected_indices.extend(train_indices[selected_label_indices])
    selected_data = np.vstack(selected_data)
    selected_indices = np.array(selected_indices)

    if distance_type == 'euclidean':
        index = faiss.IndexFlatL2(d)
    elif distance_type == 'cosine':
        norms = np.linalg.norm(selected_data, axis=1, keepdims=True)
        selected_data = selected_data / norms
        index = faiss.IndexFlatIP(d)
    else:
        raise ValueError("Unsupported distance type. Use 'euclidean' or 'cosine'.")

    index.add(selected_data)
    return index, selected_indices

In [27]:
def search_and_compare_labels(data, labels, test_indices, selected_indices, index, distance_limit):
    k = 5
    D, I = index.search(data[test_indices], k)
    test_labels = labels[test_indices]
    neighbor_labels = labels[selected_indices[I.flatten()]].reshape(I.shape)

    top_1_matches = 0
    top_3_matches = 0
    top_5_matches = 0
    classified_count = 0

    for i in range(len(test_labels)):
        if distance_limit is not None and D[i, 0] > distance_limit:
            continue

        classified_count += 1
        top_1_match = test_labels[i] in neighbor_labels[i, :1]
        top_3_match = test_labels[i] in neighbor_labels[i, :3]
        top_5_match = test_labels[i] in neighbor_labels[i, :5]

        if top_1_match:
            top_1_matches += 1
        if top_3_match:
            top_3_matches += 1
        if top_5_match:
            top_5_matches += 1

    if classified_count > 0:
        top_1_percentage = (top_1_matches / classified_count) * 100
        top_3_percentage = (top_3_matches / classified_count) * 100
        top_5_percentage = (top_5_matches / classified_count) * 100
    else:
        top_1_percentage = top_3_percentage = top_5_percentage = 0
    
    classified_percentage = (classified_count / len(test_labels)) * 100

    return classified_percentage, top_1_percentage, top_3_percentage, top_5_percentage

In [34]:
index_baseline, selected_indices = build_faiss_index(vectors_baseline, labels_baseline, train_val_indices, 'euclidean', None)
classified_baseline, top_1_baseline, top_3_baseline, top_5_baseline = search_and_compare_labels(vectors_baseline, labels_baseline, test_indices, selected_indices, index_baseline, 0.0001)

print(f"Baseline euclidean classified percentage: {classified_baseline:.2f}%")
print(f"Baseline euclidean match in top 1 neighbor: {top_1_baseline:.2f}%")
print(f"Baseline euclidean match in top 3 neighbors: {top_3_baseline:.2f}%")
print(f"Baseline euclidean match in top 5 neighbors: {top_5_baseline:.2f}%")

Baseline euclidean classified percentage: 59.11%
Baseline euclidean match in top 1 neighbor: 79.39%
Baseline euclidean match in top 3 neighbors: 85.58%
Baseline euclidean match in top 5 neighbors: 87.93%
