1. mon_standard.pkl > array code



In [32]:
import pickle

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 95

# Load the pickle file
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi: # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

X1_mon = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_mon = [] # Array to store instances (direction*size) - size information
y_mon = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1_mon.append(time_seq)
        X2_mon.append(size_seq)
        y_mon.append(label)
size = len(y_mon)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 1900


2. unmon_standard10.pkl > array code

In [33]:
import pickle
import numpy as np

TOTAL_URLS = 300  # total number in the dataset

# Load 10,000 unmon pickle file
print("Loading datafile...")
with open('unmon_standard10_3000.pkl', 'rb') as f:  # Path to unmon_standard10.pkl in Colab
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

X1_unmon = [] # Array to store instances (timestamps) - 10,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_unmon = [] # Array to store instances (direction*size) - size information

for i in range(TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in x[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    X1_unmon.append(np.array(time_seq, dtype=np.int32))
    X2_unmon.append(np.array(size_seq, dtype=np.int16))


print(len(X1_unmon)) # Print the length of X1

print(X2_unmon[0-10])

Loading datafile...


Total samples: 3000
300
[-512 -512  512 ... -512 -512 -512]


### Data Preprocessing ###

#### Remove corrupted/incomplete traces


In [34]:
def clean(X1, X2, y=None):
    X1_clean, X2_clean, y_clean = [], [], []
    for i in range(len(X1)):
        if len(X1[i]) > 0 and len(X1[i]) == len(X2[i]): # non-empty & matching lengths
            X1_clean.append(X1[i])
            X2_clean.append(X2[i])
            if y is not None:
                y_clean.append(y[i])
    return (X1_clean, X2_clean, y_clean) if y is not None else (X1_clean, X2_clean)

# clean monitored
X1_mon, X2_mon, y_mon = clean(X1_mon, X2_mon, y_mon)
print("Clean monitored traces:", len(X1_mon))

# clean unmonitored
X1_unmon, X2_unmon = clean(X1_unmon, X2_unmon)
print("Clean unmonitored traces:", len(X1_unmon))

Clean monitored traces: 1900
Clean unmonitored traces: 300


#### Normalize timestamps to start at 0

In [35]:
def normalize_timestamps(X1):
    return [[t - seq[0] for t in seq] for seq in X1] # subtract by first seq value for each value to see how much time passed in each packet

X1_mon = normalize_timestamps(X1_mon)
X1_unmon = normalize_timestamps(X1_unmon)

#### Truncate or pad sequences to certain length

In [36]:
import numpy as np

MAX_LEN = 10000

def pad_truncate(seq, max_len=10000):
    seq = list(seq)  # assure une liste Python

    if len(seq) > max_len:
        return seq[:max_len]

    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))

    return seq

# --- MONITORED --------------------------------------------------------------
X1_mon = np.array([pad_truncate(s, MAX_LEN) for s in X1_mon])
X2_mon = np.array([pad_truncate(s, MAX_LEN) for s in X2_mon])
y_mon = np.array(y_mon)

# --- UNMONITORED ------------------------------------------------------------
X1_unmon = np.array([pad_truncate(s, MAX_LEN) for s in X1_unmon])
X2_unmon = np.array([pad_truncate(s, MAX_LEN) for s in X2_unmon])

print("\nMonitored timestamps:", X1_mon.shape)
print("Unmonitored timestamps:", X1_unmon.shape)



Monitored timestamps: (1900, 10000)
Unmonitored timestamps: (300, 10000)


#### Split data into training, testing, and validation datasets

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

# 1) SCENARIO CLOSED WORLD

X1_train_cw, X1_temp_cw, X2_train_cw, X2_temp_cw, y_train_cw, y_temp_cw = train_test_split(
    X1_mon, X2_mon, y_mon,
    test_size=0.30,
    stratify=y_mon,
    random_state=42
)

X1_val_cw, X1_test_cw, X2_val_cw, X2_test_cw, y_val_cw, y_test_cw = train_test_split(
    X1_temp_cw, X2_temp_cw, y_temp_cw,
    test_size=0.50,
    stratify=y_temp_cw,
    random_state=42
)


X_train_cw = np.hstack((X1_train_cw, X2_train_cw))
X_val_cw   = np.hstack((X1_val_cw,   X2_val_cw))
X_test_cw  = np.hstack((X1_test_cw,  X2_test_cw))


# 2) SCENARIO OPEN WORLD (binary 0/1)

y_mon_binary = np.ones(len(X1_mon))        # SurveillÃ© = 1
y_unmon_binary = np.zeros(len(X1_unmon))   # Non-surveillÃ© = 0

# Combine les donnÃ©es
X1_all = np.vstack((X1_mon, X1_unmon))
X2_all = np.vstack((X2_mon, X2_unmon))
y_all  = np.concatenate((y_mon_binary, y_unmon_binary))

# Split open world
X1_train_ow, X1_temp_ow, X2_train_ow, X2_temp_ow, y_train_ow, y_temp_ow = train_test_split(
    X1_all, X2_all, y_all,
    test_size=0.30,
    stratify=y_all,
    random_state=42
)

X1_val_ow, X1_test_ow, X2_val_ow, X2_test_ow, y_val_ow, y_test_ow = train_test_split(
    X1_temp_ow, X2_temp_ow, y_temp_ow,
    test_size=0.50,
    stratify=y_temp_ow,
    random_state=42
)

# fusion embedding
X_train_ow = np.hstack((X1_train_ow, X2_train_ow))
X_val_ow   = np.hstack((X1_val_ow,   X2_val_ow))
X_test_ow  = np.hstack((X1_test_ow,  X2_test_ow))


## **KNN MODEL** (Alice)

In [38]:
from sklearn.preprocessing import StandardScaler

# --- Closed World Scaler ---
scaler_cw = StandardScaler()

# Fit only on training data
scaler_cw.fit(X_train_cw)

# Transform
X_train_cw_scaled = scaler_cw.transform(X_train_cw)
X_val_cw_scaled   = scaler_cw.transform(X_val_cw)
X_test_cw_scaled  = scaler_cw.transform(X_test_cw)

# --- Open World Scaler ---
scaler_ow = StandardScaler()

# Fit only on training data
scaler_ow.fit(X_train_ow)

# Transform
X_train_ow_scaled = scaler_ow.transform(X_train_ow)
X_val_ow_scaled   = scaler_ow.transform(X_val_ow)
X_test_ow_scaled  = scaler_ow.transform(X_test_ow)

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

print("=== CLOSED WORLD====")

# KNN model
knn_cw = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

# Train
knn_cw.fit(X_train_cw_scaled, y_train_cw)

# Validation accuracy
y_train_pred_cw = knn_cw.predict(X_train_cw_scaled)
y_val_pred_cw  = knn_cw.predict(X_val_cw_scaled)
y_test_pred_cw = knn_cw.predict(X_test_cw_scaled)


# Test accuracy
train_acc_cw = accuracy_score(y_train_cw, y_train_pred_cw)
val_acc_cw   = accuracy_score(y_val_cw, y_val_pred_cw)
test_acc_cw  = accuracy_score(y_test_cw, y_test_pred_cw)


print("Train accuracy Â  Â  Â  :", train_acc_cw)
print("Validation accuracy Â :", val_acc_cw)
print("Test accuracy Â  Â  Â  Â :", test_acc_cw)

=== CLOSED WORLD====
Train accuracy Â  Â  Â  : 0.7894736842105263
Validation accuracy Â : 0.6736842105263158
Test accuracy Â  Â  Â  Â : 0.6701754385964912


### Closed world K-NN Hyperparameter Tuning

In [48]:
from sklearn.model_selection import GridSearchCV

print("\n--- CLOSED WORLD K-NN OPTIMIZATION (GRID SEARCH) ---")

# 1. Define parameters to test
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],       # Test different neighbors
    'weights': ['uniform', 'distance'] # Test weight modes
}

# 2. Run the search (cv=3 for faster execution)
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, n_jobs=-1, verbose=1)
grid_knn.fit(X_train_cw_scaled, y_train_cw)

# 3. Retrieve the best model
best_knn_cw = grid_knn.best_estimator_
tuned_acc_cw = best_knn_cw.score(X_test_cw_scaled, y_test_cw)

print(f"\nBest parameters found    : {grid_knn.best_params_}")
print(f"Accuracy WITHOUT tuning  : {test_acc_cw:.4f}")
print(f"Accuracy WITH tuning     : {tuned_acc_cw:.4f}")

# 4. Display the gain
gain = tuned_acc_cw - test_acc_cw
if gain > 0:
    print(f"IMPROVEMENT: +{gain:.4f}")
else:
    print(f"ðŸ”¹ No significant improvement (default model was already good).")


--- CLOSED WORLD K-NN OPTIMIZATION (GRID SEARCH) ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best parameters found    : {'n_neighbors': 3, 'weights': 'distance'}
Accuracy WITHOUT tuning  : 0.6737
Accuracy WITH tuning     : 0.6807
IMPROVEMENT: +0.0070


In [40]:
print("\n=== OPEN WORLD (binary 0=unmon, 1=mon) ===")

# KNN model
knn_ow = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

# Train
knn_ow.fit(X_train_ow_scaled, y_train_ow)

y_train_pred_ow = knn_ow.predict(X_train_ow_scaled)
y_val_pred_ow   = knn_ow.predict(X_val_ow_scaled)
y_test_pred_ow  = knn_ow.predict(X_test_ow_scaled)

# --- Calcul accuracy ---
train_acc_ow = accuracy_score(y_train_ow, y_train_pred_ow)
val_acc_ow   = accuracy_score(y_val_ow, y_val_pred_ow)
test_acc_ow  = accuracy_score(y_test_ow, y_test_pred_ow)


print("Train accuracy Â  Â  Â  :", train_acc_ow)
print("Validation accuracy Â :", val_acc_ow)
print("Test accuracy Â  Â  Â  Â :", test_acc_ow)



=== OPEN WORLD (binary 0=unmon, 1=mon) ===
Train accuracy Â  Â  Â  : 0.9227272727272727
Validation accuracy Â : 0.8757575757575757
Test accuracy Â  Â  Â  Â : 0.8636363636363636


### Open world K-NN Hyperparameter Tuning

In [49]:
from sklearn.model_selection import GridSearchCV

print("\n--- OPEN WORLD K-NN OPTIMIZATION (GRID SEARCH) ---")

# 1. Define parameters to test
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],       # Test different neighbors
    'weights': ['uniform', 'distance'] # Test weight modes
}

# 2. Run the search (cv=3 for faster execution)
# Note: We use the Open World variables (_ow) here
grid_knn_ow = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, n_jobs=-1, verbose=1)
grid_knn_ow.fit(X_train_ow_scaled, y_train_ow)

# 3. Retrieve the best model
best_knn_ow = grid_knn_ow.best_estimator_
tuned_acc_ow = best_knn_ow.score(X_test_ow_scaled, y_test_ow)

print(f"\nBest parameters found    : {grid_knn_ow.best_params_}")
print(f"Accuracy WITHOUT tuning  : {test_acc_ow:.4f}")
print(f"Accuracy WITH tuning     : {tuned_acc_ow:.4f}")

# 4. Display the gain
gain_ow = tuned_acc_ow - test_acc_ow
if gain_ow > 0:
    print(f"IMPROVEMENT: +{gain_ow:.4f}")
else:
    print(f"ðŸ”¹ No significant improvement (default model was already good).")


--- OPEN WORLD K-NN OPTIMIZATION (GRID SEARCH) ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best parameters found    : {'n_neighbors': 9, 'weights': 'uniform'}
Accuracy WITHOUT tuning  : 0.8636
Accuracy WITH tuning     : 0.8909
IMPROVEMENT: +0.0273


## **SVM (Alice)**

In [51]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

print("=== CLOSED WORLD (SVM) ====")

svm_cw = SVC(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    random_state=42,
    verbose=True
)

# --- Training ---
start_time = time.time()
print("DÃ©marrage de l'entraÃ®nement...")
svm_cw.fit(X_train_cw_scaled, y_train_cw)
end_time = time.time()
print(f"Temps d'entraÃ®nement: {end_time - start_time:.2f} secondes")

# --- Predictions ---
y_train_pred_cw = svm_cw.predict(X_train_cw_scaled) # Training
y_val_pred_cw   = svm_cw.predict(X_val_cw_scaled)   # Validation
y_test_pred_cw  = svm_cw.predict(X_test_cw_scaled)  # Test

# --- Calcul accuracy ---
train_acc_cw = accuracy_score(y_train_cw, y_train_pred_cw)
val_acc_cw   = accuracy_score(y_val_cw, y_val_pred_cw)
test_acc_cw  = accuracy_score(y_test_cw, y_test_pred_cw)


print("\n--- RÃ‰SULTATS ---")
print("Train accuracy Â  Â  Â  :", train_acc_cw)
print("Validation accuracy Â :", val_acc_cw)
print("Test accuracy Â  Â  Â  Â :", test_acc_cw)

=== CLOSED WORLD (SVM) ====
DÃ©marrage de l'entraÃ®nement...
[LibSVM]Temps d'entraÃ®nement: 13.34 secondes

--- RÃ‰SULTATS ---
Train accuracy Â  Â  Â  : 0.8639097744360902
Validation accuracy Â : 0.6456140350877193
Test accuracy Â  Â  Â  Â : 0.6736842105263158


### Closed world SVM Hyperparameter Tuning

In [52]:
from sklearn.model_selection import GridSearchCV

print("\n--- CLOSED WORLD SVM OPTIMIZATION (GRID SEARCH) ---")

# 1. Define parameters to test
# We limit the grid because SVM is computationally expensive on large data
param_grid_svm = {
    'C': [0.1, 1, 10, 100], 
    'kernel': ['rbf'] 
}

# 2. Run the search
grid_svm_cw = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_svm_cw.fit(X_train_cw_scaled, y_train_cw)

# 3. Retrieve the best model
best_svm_cw = grid_svm_cw.best_estimator_
tuned_acc_cw = best_svm_cw.score(X_test_cw_scaled, y_test_cw)

print(f"\nBest parameters found    : {grid_svm_cw.best_params_}")
print(f"Accuracy WITHOUT tuning  : {test_acc_cw:.4f}")
print(f"Accuracy WITH tuning     : {tuned_acc_cw:.4f}")

# 4. Display the gain
gain_cw = tuned_acc_cw - test_acc_cw
if gain_cw > 0:
    print(f"IMPROVEMENT: +{gain_cw:.4f}")
else:
    print(f"No significant improvement.")


--- CLOSED WORLD SVM OPTIMIZATION (GRID SEARCH) ---
Fitting 3 folds for each of 4 candidates, totalling 12 fits

Best parameters found    : {'C': 10, 'kernel': 'rbf'}
Accuracy WITHOUT tuning  : 0.6737
Accuracy WITH tuning     : 0.8526
IMPROVEMENT: +0.1789


In [42]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

print("=== OPEN WORLD (SVM) ====")

svm_ow = SVC(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    random_state=42,
    verbose=True
)

# --- Training ---
start_time = time.time()
print("DÃ©marrage de l'entraÃ®nement...")
svm_ow.fit(X_train_ow_scaled, y_train_ow)
end_time = time.time()
print(f"Temps d'entraÃ®nement: {end_time - start_time:.2f} secondes")

# --- Predictions ---
y_train_pred_ow = svm_ow.predict(X_train_ow_scaled) # Training
y_val_pred_ow   = svm_ow.predict(X_val_ow_scaled)   # Validation
y_test_pred_ow  = svm_ow.predict(X_test_ow_scaled)  # Test

# --- Calcul accuracy ---
train_acc_ow = accuracy_score(y_train_ow, y_train_pred_ow)
val_acc_ow   = accuracy_score(y_val_ow, y_val_pred_ow)
test_acc_ow  = accuracy_score(y_test_ow, y_test_pred_ow)


print("\n--- RÃ‰SULTATS ---")
print("Train accuracy Â  Â  Â  :", train_acc_ow)
print("Validation accuracy Â :", val_acc_ow)
print("Test accuracy Â  Â  Â  Â :", test_acc_ow)


=== OPEN WORLD (SVM) ====
DÃ©marrage de l'entraÃ®nement...
[LibSVM]Temps d'entraÃ®nement: 43.35 secondes

--- RÃ‰SULTATS ---
Train accuracy Â  Â  Â  : 0.8974025974025974
Validation accuracy Â : 0.8666666666666667
Test accuracy Â  Â  Â  Â : 0.8636363636363636


### Open world SVM Hyperparameter Tuning

In [53]:
from sklearn.model_selection import GridSearchCV

print("\n--- OPEN WORLD SVM OPTIMIZATION (GRID SEARCH) ---")
print("Warning: This process might be slow...")

# 1. Define parameters to test
# We use a limited grid to keep computation time reasonable
param_grid_svm = {
    'C': [0.1, 1, 10],   # Regularization parameter
    'kernel': ['rbf']    # RBF is usually best for this task
}

# 2. Run the search
# cv=3 is used for speed. n_jobs=-1 uses all CPU cores.
grid_svm_ow = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_svm_ow.fit(X_train_ow_scaled, y_train_ow)

# 3. Retrieve the best model
best_svm_ow = grid_svm_ow.best_estimator_
tuned_acc_ow = best_svm_ow.score(X_test_ow_scaled, y_test_ow)

print(f"\nBest parameters found    : {grid_svm_ow.best_params_}")
print(f"Accuracy WITHOUT tuning  : {test_acc_ow:.4f}")
print(f"Accuracy WITH tuning     : {tuned_acc_ow:.4f}")

# 4. Display the gain
gain_ow = tuned_acc_ow - test_acc_ow
if gain_ow > 0:
    print(f"IMPROVEMENT: +{gain_ow:.4f}")
else:
    print(f"No significant improvement.")


--- OPEN WORLD SVM OPTIMIZATION (GRID SEARCH) ---
Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best parameters found    : {'C': 10, 'kernel': 'rbf'}
Accuracy WITHOUT tuning  : 0.8636
Accuracy WITH tuning     : 0.9333
IMPROVEMENT: +0.0697
