In [22]:
import numpy as np
import tcbench as tcb
from scipy import stats
from functions import (extract_results_splits)

In [2]:
# Load the precomputed MIRAGE19 artifacts: one (vectors, labels) pair per representation.
vectors_baseline, labels_baseline = (
    np.load('artifacts-mirage19/baseline_vectors.npy'),
    np.load('artifacts-mirage19/baseline_labels.npy'),
)
vectors_embeddings, labels_embeddings = (
    np.load('artifacts-mirage19/embeddings_vectors.npy'),
    np.load('artifacts-mirage19/embeddings_labels.npy'),
)

In [3]:
df_split = tcb.load_parquet(tcb.DATASETS.MIRAGE19, min_pkts=10, split=True)

# Build one (train, test) pair per split row: entries 0 and 1 of the row are
# concatenated into the first element, entry 2 becomes the second element.
# NOTE(review): entries 0/1/2 are presumably train/validation/test indices — confirm
# against the tcbench split schema.
train_test_splits = [
    (np.concatenate([df_split.iloc[split_idx][0], df_split.iloc[split_idx][1]]),
     df_split.iloc[split_idx][2])
    for split_idx in range(5)
]

In [4]:
# Sample sizes handed to extract_results_splits; the result dicts are later indexed
# by these same values.
# NOTE(review): None presumably means "no subsampling / use everything" — confirm in
# functions.extract_results_splits.
samples = [10, 100, 1000, None]

In [None]:
# Run extract_results_splits on each representation. Every call returns a
# (coverage, accuracy) pair. Arguments stay positional because the helper's
# parameter names are not visible from this notebook; the trailing [None] is the
# key used later when the accuracy results are read back.

# Baseline vectors, euclidean / 'distance'.
coverage_results_baseline_splits, accuracy_results_baseline_splits = extract_results_splits(
    vectors_baseline,
    labels_baseline,
    train_test_splits,
    'euclidean',
    'distance',
    samples,
    [None],
)

# Embedding vectors, euclidean / 'distance'.
coverage_results_embeddings_splits, accuracy_results_embeddings_splits = extract_results_splits(
    vectors_embeddings,
    labels_embeddings,
    train_test_splits,
    'euclidean',
    'distance',
    samples,
    [None],
)

# Embedding vectors, cosine / 'similarity'.
coverage_results_embeddings_cosine_splits, accuracy_results_embeddings_cosine_splits = extract_results_splits(
    vectors_embeddings,
    labels_embeddings,
    train_test_splits,
    'cosine',
    'similarity',
    samples,
    [None],
)

In [19]:
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean with a two-sided Student-t confidence interval.

    Args:
        data: Values convertible with ``np.array``; the code treats ``len`` of
            that array as the sample count (intended for 1-D input — TODO confirm).
        confidence: Confidence level of the interval, 0.95 by default.

    Returns:
        A ``(mean, lower, upper)`` tuple where ``lower``/``upper`` are the mean
        minus/plus the interval half-width.
    """
    sample = np.array(data)
    center = np.mean(sample)
    # Two-sided critical value of Student's t with len(sample) - 1 degrees of freedom.
    t_critical = stats.t.ppf((1 + confidence) / 2., len(sample) - 1)
    half_width = stats.sem(sample) * t_critical
    return center, center - half_width, center + half_width

In [24]:
# Report baseline accuracy per sample size as mean ± 95% CI half-width.
# Iterating over `samples` (rather than a re-typed literal) keeps the keys in sync
# with the sizes the results were computed for.
for k in samples:
    # Results are keyed by sample size, then by the None entry passed to
    # extract_results_splits. Presumably one accuracy value per split — TODO confirm.
    values = accuracy_results_baseline_splits[k][None]
    mean, _, upper = mean_confidence_interval(values)
    # The interval is symmetric around the mean, so upper - mean is the half-width.
    error = upper - mean
    print(f"Sample {k}: {mean:.4f} ± {error:.4f} (95% CI)")

Sample 10: 25.0954 ± 0.2904 (95% CI)
Sample 100: 39.9333 ± 0.1671 (95% CI)
Sample 1000: 55.8146 ± 0.1399 (95% CI)
Sample None: 63.2315 ± 0.2206 (95% CI)


In [25]:
# Report embedding (euclidean) accuracy per sample size as mean ± 95% CI half-width.
# Iterating over `samples` (rather than a re-typed literal) keeps the keys in sync
# with the sizes the results were computed for.
for k in samples:
    # Results are keyed by sample size, then by the None entry passed to
    # extract_results_splits. Presumably one accuracy value per split — TODO confirm.
    values = accuracy_results_embeddings_splits[k][None]
    mean, _, upper = mean_confidence_interval(values)
    # The interval is symmetric around the mean, so upper - mean is the half-width.
    error = upper - mean
    print(f"Sample {k}: {mean:.4f} ± {error:.4f} (95% CI)")

Sample 10: 55.6871 ± 0.4059 (95% CI)
Sample 100: 66.6990 ± 0.2328 (95% CI)
Sample 1000: 74.8875 ± 0.1937 (95% CI)
Sample None: 78.5915 ± 0.1567 (95% CI)


In [26]:
# Report embedding (cosine) accuracy per sample size as mean ± 95% CI half-width.
# Iterating over `samples` (rather than a re-typed literal) keeps the keys in sync
# with the sizes the results were computed for.
for k in samples:
    # Results are keyed by sample size, then by the None entry passed to
    # extract_results_splits. Presumably one accuracy value per split — TODO confirm.
    values = accuracy_results_embeddings_cosine_splits[k][None]
    mean, _, upper = mean_confidence_interval(values)
    # The interval is symmetric around the mean, so upper - mean is the half-width.
    error = upper - mean
    print(f"Sample {k}: {mean:.4f} ± {error:.4f} (95% CI)")

Sample 10: 61.5756 ± 0.4648 (95% CI)
Sample 100: 69.7370 ± 0.2483 (95% CI)
Sample 1000: 76.2502 ± 0.2218 (95% CI)
Sample None: 79.5513 ± 0.2323 (95% CI)
