1. mon_standard.pkl > array code



In [21]:
import pickle

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

# Load the pickle file
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi: # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

X1_mon = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_mon = [] # Array to store instances (direction*size) - size information
y_mon = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1_mon.append(time_seq)
        X2_mon.append(size_seq)
        y_mon.append(label)
size = len(y_mon)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 19000


2. unmon_standard10.pkl > array code

In [22]:
import pickle

TOTAL_URLS = 10000  # total number in the dataset

# Load 10,000 unmon pickle file
print("Loading datafile...")
with open('unmon_standard10.pkl', 'rb') as f:  # Path to unmon_standard10.pkl in Colab
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

X1_unmon = [] # Array to store instances (timestamps) - 10,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2_unmon = [] # Array to store instances (direction*size) - size information

for i in range(TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in x[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    X1_unmon.append(time_seq)
    X2_unmon.append(size_seq)

print(len(X1_unmon)) # Print the length of X1

print(X2_unmon[0-10])

Loading datafile...
Total samples: 10000
10000
[-512, -512, 512, -512, 512, -512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, 512, -512, 512, 512, -512, 512, 512, 512, -512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, 512, 512, -512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, 512, -512, -512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, 512, 512, 512, 512, 512, -512, -512, 512, 512, -512, -512, 512, -512, -512, -512, -512, 512, 512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, -512, 512, 512, 512, 512, 512, 512, -512, 512, -512, -512, -512, -512, -512, -512, -512, -512, 512, 512, -512, -512, -512, -512, -512, -512, -512, -512, -51

### Data Preprocessing ###

#### Remove corrupted/incomplete traces


In [23]:
def clean(X1, X2, y=None):
    X1_clean, X2_clean, y_clean = [], [], []
    for i in range(len(X1)):
        if len(X1[i]) > 0 and len(X1[i]) == len(X2[i]): # non-empty & matching lengths
            X1_clean.append(X1[i])
            X2_clean.append(X2[i])
            if y is not None:
                y_clean.append(y[i])
    return (X1_clean, X2_clean, y_clean) if y is not None else (X1_clean, X2_clean)

# clean monitored
X1_mon, X2_mon, y_mon = clean(X1_mon, X2_mon, y_mon)
print("Clean monitored traces:", len(X1_mon))

# clean unmonitored
X1_unmon, X2_unmon = clean(X1_unmon, X2_unmon)
print("Clean unmonitored traces:", len(X1_unmon))

Clean monitored traces: 19000
Clean unmonitored traces: 10000


#### Normalize timestamps to start at 0

In [24]:
def normalize_timestamps(X1):
    return [[t - seq[0] for t in seq] for seq in X1] # subtract by first seq value for each value to see how much time passed in each packet

X1_mon = normalize_timestamps(X1_mon)
X1_unmon = normalize_timestamps(X1_unmon)

#### Truncate or pad sequences to certain length

In [25]:
import numpy as np

# lengths of all monitored sequences
mon_lengths = [len(seq) for seq in X1_mon]

# lengths of all unmonitored sequences
unmon_lengths = [len(seq) for seq in X1_unmon]

print("Monitored percentiles:")
for p in [50, 75, 90, 95, 99]:
    print(f"{p}:", np.percentile(mon_lengths, p))

print("\nUnmonitored percentiles:")
for p in [50, 75, 90, 95, 99]:
    print(f"{p}:", np.percentile(unmon_lengths, p))

'''
Monitored percentiles:
50: 3309.0
75: 6378.0
90: 9929.0
95: 9963.0
99: 9981.0

Unmonitored percentiles:
50: 4193.0
75: 8233.25
90: 9960.0
95: 9973.0
99: 9984.0

Since at least 90% of the traces have a length > 9,900 packets, the max length should be around 10,000 just to be safe
'''

MAX_LEN = 10000

# truncates sequences if longer than 10,000, pads sequences if shorter than 10,000 with zeros
def pad_truncate(seq, max_len=MAX_LEN):
    seq = seq[:max_len] # truncate

    if len(seq) < max_len: # pad
        seq = seq + [0] * (max_len - len(seq))

    return seq

# new padded/truncated monitored traces
X1_mon = np.array([pad_truncate(s) for s in X1_mon])
X2_mon = np.array([pad_truncate(s) for s in X2_mon])
y_mon = np.array(y_mon)

# new padded/truncated unmonitored traces
X1_unmon = np.array([pad_truncate(s) for s in X1_unmon])
X2_unmon = np.array([pad_truncate(s) for s in X2_unmon])

# check to see that both have length of 10,000
print("\nMonitored timestamps:", X1_mon.shape) # should be (19000 traces, 10000 length)
print("Unmonitored timestamps:", X1_unmon.shape) # should be (1000 traces, 10000 length)

Monitored percentiles:
50: 3309.0
75: 6378.0
90: 9929.0
95: 9963.0
99: 9981.0

Unmonitored percentiles:
50: 4193.0
75: 8233.25
90: 9960.0
95: 9973.0
99: 9984.0

Monitored timestamps: (19000, 10000)
Unmonitored timestamps: (10000, 10000)


#### Split data into training, testing, and validation datasets

In [26]:
from sklearn.model_selection import train_test_split

# first, split data into test/validation (30%) and train (70%)

X1_train, X1_temp, X2_train, X2_temp, y_train, y_temp = train_test_split(
    X1_mon, X2_mon, y_mon,
    test_size=0.30,
    stratify=y_mon, # use stratified splitting to preserve class balance, especially since we are doing multi-class classification
    random_state=42
)

# then, split data into test (15%) and validation (15%)

X1_val, X1_test, X2_val, X2_test, y_val, y_test = train_test_split(
    X1_temp, X2_temp, y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42
)

# check to see that datasets were split correctly
print("Training:", len(X1_train))
print("Validation:", len(X1_val))
print("Testing:", len(X1_test))

Training: 13300
Validation: 2850
Testing: 2850
