1. mon_standard.pkl > array code



In [15]:
import pickle

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 95

# Load the pickle file
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi: # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

X1_mon = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_mon = [] # Array to store instances (direction*size) - size information
y_mon = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1_mon.append(time_seq)
        X2_mon.append(size_seq)
        y_mon.append(label)
size = len(y_mon)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 1900


2. unmon_standard10.pkl > array code

In [16]:
import pickle
import numpy as np

TOTAL_URLS = 300  # total number in the dataset

# Load 10,000 unmon pickle file
print("Loading datafile...")
with open('unmon_standard10_3000.pkl', 'rb') as f:  # Path to unmon_standard10.pkl in Colab
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

X1_unmon = [] # Array to store instances (timestamps) - 10,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_unmon = [] # Array to store instances (direction*size) - size information

for i in range(TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in x[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    X1_unmon.append(np.array(time_seq, dtype=np.int32))
    X2_unmon.append(np.array(size_seq, dtype=np.int16))


print(len(X1_unmon)) # Print the length of X1

print(X2_unmon[0-10])

Loading datafile...
Total samples: 3000
300
[-512 -512  512 ... -512 -512 -512]


### Data Preprocessing ###

#### Remove corrupted/incomplete traces


In [17]:
def clean(X1, X2, y=None):
    X1_clean, X2_clean, y_clean = [], [], []
    for i in range(len(X1)):
        if len(X1[i]) > 0 and len(X1[i]) == len(X2[i]): # non-empty & matching lengths
            X1_clean.append(X1[i])
            X2_clean.append(X2[i])
            if y is not None:
                y_clean.append(y[i])
    return (X1_clean, X2_clean, y_clean) if y is not None else (X1_clean, X2_clean)

# clean monitored
X1_mon, X2_mon, y_mon = clean(X1_mon, X2_mon, y_mon)
print("Clean monitored traces:", len(X1_mon))

# clean unmonitored
X1_unmon, X2_unmon = clean(X1_unmon, X2_unmon)
print("Clean unmonitored traces:", len(X1_unmon))

Clean monitored traces: 1900
Clean unmonitored traces: 300


#### Normalize timestamps to start at 0

In [18]:
def normalize_timestamps(X1):
    return [[t - seq[0] for t in seq] for seq in X1] # subtract by first seq value for each value to see how much time passed in each packet

X1_mon = normalize_timestamps(X1_mon)
X1_unmon = normalize_timestamps(X1_unmon)

#### Truncate or pad sequences to certain length

In [19]:
import numpy as np

MAX_LEN = 10000

def pad_truncate(seq, max_len=10000):
    seq = list(seq)  # assure une liste Python

    if len(seq) > max_len:
        return seq[:max_len]

    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))

    return seq

# --- MONITORED --------------------------------------------------------------
X1_mon = np.array([pad_truncate(s, MAX_LEN) for s in X1_mon])
X2_mon = np.array([pad_truncate(s, MAX_LEN) for s in X2_mon])
y_mon = np.array(y_mon)

# --- UNMONITORED ------------------------------------------------------------
X1_unmon = np.array([pad_truncate(s, MAX_LEN) for s in X1_unmon])
X2_unmon = np.array([pad_truncate(s, MAX_LEN) for s in X2_unmon])

print("\nMonitored timestamps:", X1_mon.shape)
print("Unmonitored timestamps:", X1_unmon.shape)



Monitored timestamps: (1900, 10000)
Unmonitored timestamps: (300, 10000)


#### Split data into training, testing, and validation datasets

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split

# 1) SCENARIO CLOSED WORLD

X1_train_cw, X1_temp_cw, X2_train_cw, X2_temp_cw, y_train_cw, y_temp_cw = train_test_split(
    X1_mon, X2_mon, y_mon,
    test_size=0.30,
    stratify=y_mon,
    random_state=42
)

X1_val_cw, X1_test_cw, X2_val_cw, X2_test_cw, y_val_cw, y_test_cw = train_test_split(
    X1_temp_cw, X2_temp_cw, y_temp_cw,
    test_size=0.50,
    stratify=y_temp_cw,
    random_state=42
)


X_train_cw = np.hstack((X1_train_cw, X2_train_cw))
X_val_cw   = np.hstack((X1_val_cw,   X2_val_cw))
X_test_cw  = np.hstack((X1_test_cw,  X2_test_cw))


# 2) SCENARIO OPEN WORLD (binary 0/1)

y_mon_binary = np.ones(len(X1_mon))        # Surveillé = 1
y_unmon_binary = np.zeros(len(X1_unmon))   # Non-surveillé = 0

# Combine les données
X1_all = np.vstack((X1_mon, X1_unmon))
X2_all = np.vstack((X2_mon, X2_unmon))
y_all  = np.concatenate((y_mon_binary, y_unmon_binary))

# Split open world
X1_train_ow, X1_temp_ow, X2_train_ow, X2_temp_ow, y_train_ow, y_temp_ow = train_test_split(
    X1_all, X2_all, y_all,
    test_size=0.30,
    stratify=y_all,
    random_state=42
)

X1_val_ow, X1_test_ow, X2_val_ow, X2_test_ow, y_val_ow, y_test_ow = train_test_split(
    X1_temp_ow, X2_temp_ow, y_temp_ow,
    test_size=0.50,
    stratify=y_temp_ow,
    random_state=42
)

# fusion embedding
X_train_ow = np.hstack((X1_train_ow, X2_train_ow))
X_val_ow   = np.hstack((X1_val_ow,   X2_val_ow))
X_test_ow  = np.hstack((X1_test_ow,  X2_test_ow))


## **KNN MODEL** (Alice)

In [24]:
from sklearn.preprocessing import StandardScaler

# --- Closed World Scaler ---
scaler_cw = StandardScaler()

# Fit only on training data
scaler_cw.fit(X_train_cw)

# Transform
X_train_cw_scaled = scaler_cw.transform(X_train_cw)
X_val_cw_scaled   = scaler_cw.transform(X_val_cw)
X_test_cw_scaled  = scaler_cw.transform(X_test_cw)

# --- Open World Scaler ---
scaler_ow = StandardScaler()

# Fit only on training data
scaler_ow.fit(X_train_ow)

# Transform
X_train_ow_scaled = scaler_ow.transform(X_train_ow)
X_val_ow_scaled   = scaler_ow.transform(X_val_ow)
X_test_ow_scaled  = scaler_ow.transform(X_test_ow)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

print("=== CLOSED WORLD====")

# KNN model
knn_cw = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

# Train
knn_cw.fit(X_train_cw_scaled, y_train_cw)

# Validation accuracy
y_train_pred_cw = knn_cw.predict(X_train_cw_scaled)
y_val_pred_cw  = knn_cw.predict(X_val_cw_scaled)
y_test_pred_cw = knn_cw.predict(X_test_cw_scaled)


# Test accuracy
train_acc_cw = accuracy_score(y_train_cw, y_train_pred_cw)
val_acc_cw   = accuracy_score(y_val_cw, y_val_pred_cw)
test_acc_cw  = accuracy_score(y_test_cw, y_test_pred_cw)


print("Train accuracy       :", train_acc_cw)
print("Validation accuracy  :", val_acc_cw)
print("Test accuracy        :", test_acc_cw)

=== CLOSED WORLD====
Train accuracy       : 0.7894736842105263
Validation accuracy  : 0.6736842105263158
Test accuracy        : 0.6701754385964912


In [25]:
print("\n=== OPEN WORLD (binary 0=unmon, 1=mon) ===")

# KNN model
knn_ow = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

# Train
knn_ow.fit(X_train_ow_scaled, y_train_ow)

y_train_pred_ow = knn_ow.predict(X_train_ow_scaled)
y_val_pred_ow   = knn_ow.predict(X_val_ow_scaled)
y_test_pred_ow  = knn_ow.predict(X_test_ow_scaled)

# --- Calcul accuracy ---
train_acc_ow = accuracy_score(y_train_ow, y_train_pred_ow)
val_acc_ow   = accuracy_score(y_val_ow, y_val_pred_ow)
test_acc_ow  = accuracy_score(y_test_ow, y_test_pred_ow)


print("Train accuracy       :", train_acc_ow)
print("Validation accuracy  :", val_acc_ow)
print("Test accuracy        :", test_acc_ow)



=== OPEN WORLD (binary 0=unmon, 1=mon) ===
Train accuracy       : 0.9227272727272727
Validation accuracy  : 0.8757575757575757
Test accuracy        : 0.8636363636363636


## **SVM (Alice)**

In [26]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

print("=== CLOSED WORLD (SVM) ====")

svm_cw = SVC(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    random_state=42,
    verbose=True
)

# --- Training ---
start_time = time.time()
print("Démarrage de l'entraînement...")
svm_cw.fit(X_train_cw_scaled, y_train_cw)
end_time = time.time()
print(f"Temps d'entraînement: {end_time - start_time:.2f} secondes")

# --- Predictions ---
y_train_pred_cw = svm_cw.predict(X_train_cw_scaled) # Training
y_val_pred_cw   = svm_cw.predict(X_val_cw_scaled)   # Validation
y_test_pred_cw  = svm_cw.predict(X_test_cw_scaled)  # Test

# --- Calcul accuracy ---
train_acc_cw = accuracy_score(y_train_cw, y_train_pred_cw)
val_acc_cw   = accuracy_score(y_val_cw, y_val_pred_cw)
test_acc_cw  = accuracy_score(y_test_cw, y_test_pred_cw)


print("\n--- RÉSULTATS ---")
print("Train accuracy       :", train_acc_cw)
print("Validation accuracy  :", val_acc_cw)
print("Test accuracy        :", test_acc_cw)

=== CLOSED WORLD (SVM) ====
Démarrage de l'entraînement...
[LibSVM]Temps d'entraînement: 79.28 secondes

--- RÉSULTATS ---
Train accuracy       : 0.8639097744360902
Validation accuracy  : 0.6456140350877193
Test accuracy        : 0.6736842105263158


In [27]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

print("=== OPEN WORLD (SVM) ====")

svm_ow = SVC(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    random_state=42,
    verbose=True
)

# --- Training ---
start_time = time.time()
print("Démarrage de l'entraînement...")
svm_ow.fit(X_train_ow_scaled, y_train_ow)
end_time = time.time()
print(f"Temps d'entraînement: {end_time - start_time:.2f} secondes")

# --- Predictions ---
y_train_pred_ow = svm_ow.predict(X_train_ow_scaled) # Training
y_val_pred_ow   = svm_ow.predict(X_val_ow_scaled)   # Validation
y_test_pred_ow  = svm_ow.predict(X_test_ow_scaled)  # Test

# --- Calcul accuracy ---
train_acc_ow = accuracy_score(y_train_ow, y_train_pred_ow)
val_acc_ow   = accuracy_score(y_val_ow, y_val_pred_ow)
test_acc_ow  = accuracy_score(y_test_ow, y_test_pred_ow)


print("\n--- RÉSULTATS ---")
print("Train accuracy       :", train_acc_ow)
print("Validation accuracy  :", val_acc_ow)
print("Test accuracy        :", test_acc_ow)

=== OPEN WORLD (SVM) ====
Démarrage de l'entraînement...
[LibSVM]Temps d'entraînement: 28.29 secondes

--- RÉSULTATS ---
Train accuracy       : 0.8974025974025974
Validation accuracy  : 0.8666666666666667
Test accuracy        : 0.8636363636363636
