1. mon_standard.pkl > array code



In [10]:
import pickle

USE_SUBLABEL = False  # True: one label per URL; False: one label per site
URL_PER_SITE = 10     # consecutive URLs that share one site label
TOTAL_URLS   = 95     # number of entries of the pickle that are read (indices 0..94)

# Load the monitored dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files from a trusted source.
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi: # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

# One entry per trace (1,900 traces in the recorded run of this cell):
X1_mon = [] # abs(c) for every value c of the trace (the timestamps, per the original notes)
X2_mon = [] # sign of every value c times a constant magnitude of 512
y_mon = []  # label of the trace

# With USE_SUBLABEL off the label is the site index, i.e. url index // URL_PER_SITE,
# so every URL of the same site gets the same label. With the constants above
# that yields labels 0..9, and label 9 only covers URL indices 90..94.
for url_idx in range(TOTAL_URLS):
    label = url_idx if USE_SUBLABEL else url_idx // URL_PER_SITE
    for trace in data[url_idx]:
        # NOTE(review): a value of exactly 0 gets direction -1 — confirm this is intended.
        X1_mon.append([abs(c) for c in trace])
        X2_mon.append([(1 if c > 0 else -1) * 512 for c in trace])
        y_mon.append(label)
size = len(y_mon)

print(f'Total samples: {size}') # 1900 in the recorded run


Loading datafile...
Total samples: 1900


2. unmon_standard10_3000.pkl > array code

In [11]:
import pickle
import numpy as np

# Number of unmonitored traces to read from the file (the file itself holds
# more: the recorded run reports 3000). Note that this rebinds the TOTAL_URLS
# name that the monitored-loading cell also sets.
TOTAL_URLS = 300

# Load the unmonitored dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files from a trusted source.
print("Loading datafile...")
with open('unmon_standard10_3000.pkl', 'rb') as f:  # Path to the unmonitored pickle in Colab
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

# One entry per trace, for the first TOTAL_URLS traces of the file:
X1_unmon = [] # abs(c) for every value c of the trace (the timestamps, per the original notes)
X2_unmon = [] # sign of every value c times a constant magnitude of 512

for i in range(TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in x[i]:
        # NOTE(review): a value of exactly 0 gets direction -1 — confirm this is intended.
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    # Keep the timestamps as floats: casting them to int32 (as before) drops the
    # fractional part of every value, unlike the monitored set which keeps floats.
    X1_unmon.append(np.array(time_seq, dtype=np.float64))
    X2_unmon.append(np.array(size_seq, dtype=np.int16))


print(len(X1_unmon)) # Number of traces kept

# Show one size sequence: the 10th from the end. The original wrote this index
# as `0-10`, which Python evaluates to -10; it is spelled out here.
print(X2_unmon[-10])

Loading datafile...
Total samples: 3000
300
[-512 -512  512 ... -512 -512 -512]


### Data Preprocessing ###

#### Remove corrupted/incomplete traces


In [12]:
def clean(X1, X2, y=None):
    """Drop traces that are empty or whose two sequences differ in length.

    Args:
        X1: Sequence of per-trace sequences.
        X2: Sequence of per-trace sequences, index-aligned with ``X1``.
        y: Optional sequence of labels, index-aligned with ``X1``.

    Returns:
        ``(X1_clean, X2_clean)`` when ``y`` is None, otherwise
        ``(X1_clean, X2_clean, y_clean)``. Each is a new list holding only the
        entries at the kept positions, in their original order.
    """
    # A position is kept when its X1 entry is non-empty and has the same
    # length as the matching X2 entry.
    kept = [
        idx for idx, seq in enumerate(X1)
        if len(seq) > 0 and len(seq) == len(X2[idx])
    ]
    X1_clean = [X1[idx] for idx in kept]
    X2_clean = [X2[idx] for idx in kept]
    if y is None:
        return (X1_clean, X2_clean)
    y_clean = [y[idx] for idx in kept]
    return (X1_clean, X2_clean, y_clean)

# Monitored set: the labels are filtered together with the traces so the three
# lists stay index-aligned.
X1_mon, X2_mon, y_mon = clean(X1_mon, X2_mon, y=y_mon)
print("Clean monitored traces:", len(X1_mon))

# Unmonitored set: no labels, so only the two trace lists come back.
X1_unmon, X2_unmon = clean(X1_unmon, X2_unmon, y=None)
print("Clean unmonitored traces:", len(X1_unmon))

Clean monitored traces: 1900
Clean unmonitored traces: 300


#### Normalize timestamps to start at 0

In [13]:
def normalize_timestamps(X1):
    """Shift every sequence so that its first value becomes zero.

    Args:
        X1: Iterable of indexable sequences of numeric values.

    Returns:
        A new list with one list per input sequence, where each value has had
        the first value of its own sequence subtracted from it. An empty
        sequence yields an empty list.
    """
    normalized = []
    for seq in X1:
        rebased = []
        for t in seq:
            # seq[0] is only read while iterating, so empty sequences are safe.
            rebased.append(t - seq[0])
        normalized.append(rebased)
    return normalized

# Rebase both timestamp sets so every trace starts at 0.
X1_mon, X1_unmon = (
    normalize_timestamps(X1_mon),
    normalize_timestamps(X1_unmon),
)

#### Truncate or pad sequences to certain length

In [14]:
import numpy as np

# Fixed length that every sequence is truncated or zero-padded to.
MAX_LEN = 10_000

def pad_truncate(seq, max_len=10000):
    """Return ``seq`` as a new list that is cut or zero-padded to ``max_len``.

    Args:
        seq: Any iterable of values; it is copied into a Python list first.
        max_len: Target length of the result.

    Returns:
        A new list: the first ``max_len`` items when ``seq`` is longer, the
        items followed by trailing zeros when it is shorter, and an unchanged
        copy when the length already matches.
    """
    # Slicing is a no-op for short inputs, and the padding below is empty for
    # inputs that already reach max_len, so one expression covers all cases.
    items = list(seq)[:max_len]
    return items + [0] * (max_len - len(items))

# Monitored set: bring every trace to MAX_LEN and stack the traces into
# 2-D arrays (one row per trace); the labels become an array as well.
X1_mon = np.array([pad_truncate(trace, max_len=MAX_LEN) for trace in X1_mon])
X2_mon = np.array([pad_truncate(trace, max_len=MAX_LEN) for trace in X2_mon])
y_mon = np.array(y_mon)

# Unmonitored set: same treatment, no labels.
X1_unmon = np.array([pad_truncate(trace, max_len=MAX_LEN) for trace in X1_unmon])
X2_unmon = np.array([pad_truncate(trace, max_len=MAX_LEN) for trace in X2_unmon])

# Shapes should read (number of traces, MAX_LEN).
print("\nMonitored timestamps:", X1_mon.shape)
print("Unmonitored timestamps:", X1_unmon.shape)



Monitored timestamps: (1900, 10000)
Unmonitored timestamps: (300, 10000)


#### Split data into training, testing, and validation datasets

In [15]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% of the monitored traces for training and hold out 30%,
# which stage 2 divides into validation and test.
# Stratifying on the labels preserves the class balance in every part, which
# matters here because this is a multi-class problem.
(X1_train, X1_temp,
 X2_train, X2_temp,
 y_train, y_temp) = train_test_split(
    X1_mon, X2_mon, y_mon,
    test_size=0.30, stratify=y_mon, random_state=42,
)

# Stage 2: halve the held-out 30% into validation (15%) and test (15%).
(X1_val, X1_test,
 X2_val, X2_test,
 y_val, y_test) = train_test_split(
    X1_temp, X2_temp, y_temp,
    test_size=0.50, stratify=y_temp, random_state=42,
)

# Report the size of each part to confirm the split.
for split_name, split in (
    ("Training:", X1_train),
    ("Validation:", X1_val),
    ("Testing:", X1_test),
):
    print(split_name, len(split))

Training: 1330
Validation: 285
Testing: 285


## **KNN MODEL**

In [16]:
import numpy as np

# Concatenate X1 and X2 along the feature axis: each row becomes the X1 row
# followed by the X2 row of the same trace.
X_train = np.concatenate((X1_train, X2_train), axis=1)
X_val   = np.concatenate((X1_val,   X2_val),   axis=1)
X_test  = np.concatenate((X1_test,  X2_test),  axis=1)

for split_name, features in (
    ("Train shape :", X_train),
    ("Val shape   :", X_val),
    ("Test shape  :", X_test),
):
    print(split_name, features.shape)

Train shape : (1330, 20000)
Val shape   : (285, 20000)
Test shape  : (285, 20000)


In [17]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to all three splits, so validation and test data do not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=3 is a starting value that can be tuned later;
# weights='distance' selects distance-based weighting of the neighbours' votes.
knn = KNeighborsClassifier(weights='distance', n_neighbors=3)

knn.fit(X_train_scaled, y_train)

In [19]:
from sklearn.metrics import accuracy_score

# Predictions for each split
y_pred_train = knn.predict(X_train_scaled)
y_pred_val   = knn.predict(X_val_scaled)
y_pred_test  = knn.predict(X_test_scaled)

# Accuracy = fraction of traces whose predicted label equals the true label.
for split_name, y_true, y_hat in (
    ("Train accuracy :", y_train, y_pred_train),
    ("Val accuracy   :", y_val,   y_pred_val),
    ("Test accuracy  :", y_test,  y_pred_test),
):
    print(split_name, accuracy_score(y_true, y_hat))

Train accuracy : 1.0
Val accuracy   : 0.6771929824561403
Test accuracy  : 0.6807017543859649


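Train accuracy = 1.0 (100%)
→ The model reproduces the training labels perfectly; it has memorized the training data. With KNN and `weights='distance'` this is expected: each training sample is its own nearest neighbour at distance 0, so this score alone says nothing about how well the model generalizes.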

Validation accuracy ≈ 0.68

Test accuracy ≈ 0.68
→ On new data, the model is only correct about 68% of the time.

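Consequently, our model is **overfitting**: the large gap between the training accuracy (1.0) and the validation/test accuracy (≈ 0.68) shows that it does not generalize well to unseen traces.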