In [1]:
import faiss
import numpy as np
import tcbench as tcb
from scipy.spatial.distance import cdist, pdist, squareform

In [2]:
# Directory holding the saved vector/label arrays loaded below. Kept in one
# place so the location can be changed without editing every np.load call.
ARTIFACTS_DIR = '/home/ev357/tcbench/src/fingerprinting/artifacts-mirage19'

# "baseline" and "embeddings" arrays; each vectors file is loaded together
# with the labels file of the same prefix.
vectors_baseline = np.load(f'{ARTIFACTS_DIR}/baseline_vectors.npy')
labels_baseline = np.load(f'{ARTIFACTS_DIR}/baseline_labels.npy')
vectors_embeddings = np.load(f'{ARTIFACTS_DIR}/embeddings_vectors.npy')
labels_embeddings = np.load(f'{ARTIFACTS_DIR}/embeddings_labels.npy')

In [3]:
def calculate_within_class_distances(feature_matrix, true_labels, metric):
    """Compute, for every class, the pairwise distances among its own samples.

    Args:
        feature_matrix: Array indexed by sample along its first axis; the rows
            belonging to a class are handed to ``pdist``.
        true_labels: Array of class labels, compared element-wise against each
            unique label to select that class's rows.
        metric: Distance metric forwarded unchanged to ``pdist``.

    Returns:
        dict mapping each unique label to the square distance matrix
        (``squareform`` of the condensed ``pdist`` result) of that class's rows.
    """
    def _pairwise_matrix(label):
        # Row positions of the samples carrying this label.
        rows = np.nonzero(true_labels == label)[0]
        return squareform(pdist(feature_matrix[rows], metric))

    return {label: _pairwise_matrix(label) for label in np.unique(true_labels)}

def calculate_between_class_distances(feature_matrix, true_labels, metric):
    """Compute, for every class, the distances from its samples to all others.

    Args:
        feature_matrix: Array indexed by sample along its first axis.
        true_labels: Array of class labels, compared element-wise against each
            unique label to split the rows into "this class" and "the rest".
        metric: Distance metric forwarded unchanged to ``cdist``.

    Returns:
        dict mapping each unique label to the ``cdist`` matrix whose rows are
        that class's samples and whose columns are every sample with a
        different label.
    """
    distances_by_label = {}
    for label in np.unique(true_labels):
        inside_rows = np.nonzero(true_labels == label)[0]
        outside_rows = np.nonzero(true_labels != label)[0]
        distances_by_label[label] = cdist(
            feature_matrix[inside_rows],
            feature_matrix[outside_rows],
            metric,
        )
    return distances_by_label

In [4]:
# Load the MIRAGE19 split table and take its first row.
# NOTE(review): elements 0/1/2 of that row are presumably the train /
# validation / test index arrays - confirm against the tcbench split schema.
df_split = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10, split=True)
first_split = df_split.iloc[0]
train_val_indices = np.concatenate((first_split[0], first_split[1]))
test_indices = first_split[2]

In [5]:
def build_faiss_index(data, labels, train_indices, distance_type, samples_per_class = None):
    """Build a flat FAISS index from (optionally sub-sampled) training vectors.

    Args:
        data: 2-D array of vectors; rows are selected with ``train_indices``.
        labels: Array of class labels aligned with the rows of ``data``.
        train_indices: Integer positions (into ``data``/``labels``) of the
            samples that may be added to the index.
        distance_type: ``'euclidean'`` builds a ``faiss.IndexFlatL2``;
            ``'cosine'`` L2-normalises the vectors and builds a
            ``faiss.IndexFlatIP`` (inner product of unit vectors equals the
            cosine similarity).
        samples_per_class: Maximum number of samples drawn per class, without
            replacement. ``None`` or ``-1`` keeps every sample of each class.

    Returns:
        ``(index, selected_indices)`` where ``selected_indices[j]`` is the
        position in ``data``/``labels`` of the j-th vector stored in ``index``.

    Raises:
        ValueError: If ``distance_type`` is unsupported, or if
            ``train_indices`` selects no samples.
    """
    # Validate first so an unsupported metric fails before any work is done.
    if distance_type not in ('euclidean', 'cosine'):
        raise ValueError("Unsupported distance type. Use 'euclidean' or 'cosine'.")

    # Private generator: yields the same draws as np.random.seed(42) followed
    # by np.random.choice, but does not reset the caller's global RNG state.
    rng = np.random.RandomState(42)

    train_indices = np.asarray(train_indices)
    filtered_data = data[train_indices]
    filtered_labels = labels[train_indices]
    d = filtered_data.shape[1]

    take_all = samples_per_class is None or samples_per_class == -1
    selected_positions = []
    for label in np.unique(filtered_labels):
        label_positions = np.where(filtered_labels == label)[0]
        if not take_all:
            label_positions = rng.choice(
                label_positions,
                size=min(samples_per_class, len(label_positions)),
                replace=False,
            )
        selected_positions.append(label_positions)

    if not selected_positions:
        raise ValueError("train_indices selects no samples; cannot build an index.")

    # Positions are relative to the filtered arrays; map them back through
    # train_indices so callers can look labels up in the original arrays.
    selected_positions = np.concatenate(selected_positions)
    selected_data = filtered_data[selected_positions]
    selected_indices = train_indices[selected_positions]

    if distance_type == 'cosine':
        norms = np.linalg.norm(selected_data, axis=1, keepdims=True)
        # A zero vector has no direction; leave it as zeros instead of
        # dividing by zero and storing NaNs in the index.
        norms[norms == 0] = 1
        selected_data = selected_data / norms
        index = faiss.IndexFlatIP(d)
    else:
        index = faiss.IndexFlatL2(d)

    # FAISS operates on C-contiguous float32 matrices.
    index.add(np.ascontiguousarray(selected_data, dtype=np.float32))
    return index, selected_indices

In [6]:
def search_and_compare_labels(data, labels, test_indices, selected_indices, index, metric, limit=None):
    """Classify test samples by their nearest indexed neighbour and score them.

    Args:
        data: 2-D array of vectors; queries are ``data[test_indices]``.
        labels: Array of class labels aligned with the rows of ``data``.
        test_indices: Positions of the query samples in ``data``/``labels``.
        selected_indices: For each vector stored in ``index``, its position in
            ``data``/``labels`` (as returned by ``build_faiss_index``).
        index: Object exposing ``search(queries, k)`` that returns
            ``(D, I)`` arrays, e.g. a FAISS index.
        metric: ``"distance"`` treats ``D`` as squared distances and reports
            their square root; ``"similarity"`` L2-normalises the queries and
            reports ``D`` as is.
        limit: Optional rejection threshold. For ``"distance"`` a query whose
            nearest distance is greater than ``limit`` is left unclassified;
            for ``"similarity"`` a query whose score is lower than ``limit``
            is left unclassified. ``None`` classifies every query.

    Returns:
        ``(classified_percentage, match_percentage, min_distances)``:
        the percentage of queries that were classified, the percentage of
        classified queries whose label equals the neighbour's label (0 when
        nothing was classified), and the list of nearest-neighbour scores for
        all queries, including rejected ones.

    Raises:
        ValueError: If ``metric`` is neither ``"distance"`` nor
            ``"similarity"``.
    """
    if metric not in ("distance", "similarity"):
        raise ValueError("Unsupported metric. Use 'distance' or 'similarity'.")

    k = 1
    query_vectors = data[test_indices]
    if metric == "similarity":
        norms = np.linalg.norm(query_vectors, axis=1, keepdims=True)
        # Keep zero vectors as zeros instead of turning them into NaNs.
        norms[norms == 0] = 1
        query_vectors = query_vectors / norms
    # FAISS operates on C-contiguous float32 matrices.
    query_vectors = np.ascontiguousarray(query_vectors, dtype=np.float32)

    D, I = index.search(query_vectors, k)
    D = np.asarray(D)
    I = np.asarray(I)

    test_labels = labels[test_indices]
    # Translate positions inside the index back to positions in `labels`.
    neighbor_labels = labels[np.asarray(selected_indices)[I[:, 0]]]

    if metric == "distance":
        # D holds squared distances; clamp guards against tiny negative
        # values caused by floating-point round-off before the square root.
        scores = np.sqrt(np.maximum(D[:, 0], 0))
    else:
        scores = D[:, 0]

    if limit is None:
        accepted = np.ones(len(test_labels), dtype=bool)
    elif metric == "distance":
        accepted = ~(scores > limit)
    else:
        accepted = ~(scores < limit)

    classified = int(np.count_nonzero(accepted))
    matches = int(np.count_nonzero(accepted & (test_labels == neighbor_labels)))

    total = len(test_labels)
    classified_percentage = (classified / total) * 100 if total > 0 else 0
    match_percentage = (matches / classified) * 100 if classified > 0 else 0
    min_distances = list(scores)
    return classified_percentage, match_percentage, min_distances

In [7]:
# Sanity check on the first two baseline samples: print their labels and the
# within-class Euclidean distance matrices computed for them.
print(labels_baseline[:2])
test_distances_within_baseline = calculate_within_class_distances(
    feature_matrix=vectors_baseline[:2],
    true_labels=labels_baseline[:2],
    metric='euclidean',
)
print(test_distances_within_baseline)

[0 0]
{0: array([[0.        , 1.01932718],
       [1.01932718, 0.        ]])}


In [9]:
# Same two baseline samples through the index path: sample 0 is indexed,
# sample 1 is the query, and the reported nearest-neighbour distance is printed.
test_index_baseline, test_selected_indices_baseline = build_faiss_index(
    data=vectors_baseline[:2],
    labels=labels_baseline[:2],
    train_indices=np.array([0]),
    distance_type='euclidean',
    samples_per_class=None,
)
(
    test_classified_percentage_baseline,
    test_match_percentage_baseline,
    test_min_distances_baseline,
) = search_and_compare_labels(
    data=vectors_baseline[:2],
    labels=labels_baseline[:2],
    test_indices=np.array([1]),
    selected_indices=test_selected_indices_baseline,
    index=test_index_baseline,
    metric="distance",
    limit=None,
)
print(test_min_distances_baseline)

[1.0193273]


In [10]:
# Sanity check on the first two embedding samples: print their labels and the
# between-class Euclidean distances computed for them.
print(labels_embeddings[:2])
test_distances_between_embeddings = calculate_between_class_distances(
    feature_matrix=vectors_embeddings[:2],
    true_labels=labels_embeddings[:2],
    metric='euclidean',
)
print(test_distances_between_embeddings)

[11 18]
{11: array([[3.61261807]]), 18: array([[3.61261807]])}


In [11]:
# Same two embedding samples through the index path: sample 0 is indexed,
# sample 1 is the query, and the reported nearest-neighbour distance is printed.
test_index_embeddings, test_selected_indices_embeddings = build_faiss_index(
    data=vectors_embeddings[:2],
    labels=labels_embeddings[:2],
    train_indices=np.array([0]),
    distance_type='euclidean',
    samples_per_class=None,
)
(
    test_classified_percentage_embeddings,
    test_match_percentage_embeddings,
    test_min_distances_embeddings,
) = search_and_compare_labels(
    data=vectors_embeddings[:2],
    labels=labels_embeddings[:2],
    test_indices=np.array([1]),
    selected_indices=test_selected_indices_embeddings,
    index=test_index_embeddings,
    metric="distance",
    limit=None,
)
print(test_min_distances_embeddings)

[3.612618]


In [12]:
# Sanity check with the cosine metric on the first two embedding samples:
# print the labels, the between-class cosine distances, and the matching
# similarities obtained as 1 - distance.
print(labels_embeddings[:2])
test_distances_between_embeddings_cosine = calculate_between_class_distances(
    feature_matrix=vectors_embeddings[:2],
    true_labels=labels_embeddings[:2],
    metric='cosine',
)
print(test_distances_between_embeddings_cosine)
test_similarities_between_embeddings_cosine = {
    class_label: 1 - class_distances
    for class_label, class_distances in test_distances_between_embeddings_cosine.items()
}
print(test_similarities_between_embeddings_cosine)
    

[11 18]
{11: array([[0.2227783]]), 18: array([[0.2227783]])}
{11: array([[0.7772217]]), 18: array([[0.7772217]])}


In [13]:
# Same two embedding samples through the cosine index path: sample 0 is
# indexed, sample 1 is the query, and the reported similarity is printed.
test_index_embeddings_cosine, test_selected_indices_embeddings_cosine = build_faiss_index(
    data=vectors_embeddings[:2],
    labels=labels_embeddings[:2],
    train_indices=np.array([0]),
    distance_type='cosine',
    samples_per_class=None,
)
(
    test_classified_percentage_embeddings_cosine,
    test_match_percentage_embeddings_cosine,
    test_min_distances_embeddings_cosine,
) = search_and_compare_labels(
    data=vectors_embeddings[:2],
    labels=labels_embeddings[:2],
    test_indices=np.array([1]),
    selected_indices=test_selected_indices_embeddings_cosine,
    index=test_index_embeddings_cosine,
    metric="similarity",
    limit=None,
)
print(test_min_distances_embeddings_cosine)

[0.7772217]
