## Closed world : Decision tree


In [2]:
import pickle
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Loading monitored datafile...")
with open("/content/sample_data/mon_standard.pkl", "rb") as f:
    monitored_data = pickle.load(f)

# Use top 1,000 samples
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 95
MAX_SAMPLES = 1000

all_samples = []
all_labels = []

for i, site_data in enumerate(monitored_data.values()):
    label = i // URL_PER_SITE if not USE_SUBLABEL else i
    for sample in site_data[:MAX_SAMPLES // TOTAL_URLS]:
        all_samples.append(sample)
        all_labels.append(label)

#feature
X_features = []
for sample in all_samples:
    # cumulative_sizes
    cumulative_sizes = np.cumsum([val * 512 for val in sample])
    # packet_sizes
    packet_sizes = [val * 512 for val in sample]
    # time stamp
    transmission_times = [abs(val) for val in sample]

    # feature combination
    combined_features = np.concatenate(
        [cumulative_sizes, packet_sizes, transmission_times]
    )
    X_features.append(combined_features)

# padding
max_length = max(len(sample) for sample in X_features)
X_padded = pad_sequences(X_features, maxlen=max_length, padding='post', truncating='post')

# labeling
y = np.array(all_labels)

# scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_padded)

# data split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# model training
print("\nClosed-world Multi-class Classification:")
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# prediction and accuracy
y_pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

Loading monitored datafile...

Closed-world Multi-class Classification:
Accuracy: 31.53%

Classification Report:
              precision    recall  f1-score   support

           0       0.08      0.05      0.06        20
           1       0.14      0.15      0.15        20
           2       0.33      0.25      0.29        20
           3       0.21      0.15      0.18        20
           4       0.38      0.40      0.39        20
           5       0.15      0.10      0.12        20
           6       0.30      0.30      0.30        20
           7       0.29      0.45      0.35        20
           8       0.39      0.45      0.42        20
           9       0.06      0.10      0.08        20
          10       0.07      0.10      0.09        20
          11       0.31      0.25      0.28        20
          12       0.43      0.50      0.47        20
          13       0.00      0.00      0.00        20
          14       0.38      0.30      0.33        20
          15       0.1

## Closed world : SVM

In [1]:
import pickle
import numpy as np

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950
MAX_LENGTH = 500


print("Loading datafile...")
with open("/content/sample_data/mon_standard.pkl", 'rb') as fi:
    data = pickle.load(fi)

def process_sample(sample):
    size_seq = np.array([1 if c > 0 else -1 for c in sample], dtype=np.int16) * 512
    time_seq = np.abs(sample).astype(np.float32)
    cumulative_sizes = np.cumsum(size_seq)
    return time_seq, size_seq, cumulative_sizes

def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen))
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq[:maxlen]
    return np.array(padded_sequences)


X1 = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
X2 = [] # Array to store instances (direction*size) - size information
X3 = []
y = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE

    for sample in data[i]:
        time_seq, size_seq, cumulative_sizes = process_sample(sample)
        X1.append(time_seq)
        X2.append(size_seq)
        X3.append(cumulative_sizes)
        y.append(label)

X1_padded = pad_sequences(X1, MAX_LENGTH).astype(np.float32)
X2_padded = pad_sequences(X2, MAX_LENGTH).astype(np.float32)
X3_padded = pad_sequences(X3, MAX_LENGTH).astype(np.float32)

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif

# feature combination
X = np.hstack((X1_padded, X2_padded,X3_padded))
y = np.array(y)

# data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(kernel="linear",random_state=42, C=1)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Test set accuracy:", accuracy)

Loading datafile...
Test set accuracy: 0.7255263157894737


## Closed world : Tree ensemble

In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950

print("Loading data...")
with open("/content/sample_data/mon_standard.pkl", "rb") as file:
    data = pickle.load(file)

X_timestamps = []  # Packet timestamps
X_packet_sizes = []  # Packet sizes
X_cum_sizes = []  # Cumulative packet size
y_monitored = []  # Labels

for i in range(TOTAL_URLS):
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        timestamps = np.empty(len(sample), dtype=np.float32)
        packet_sizes = np.empty(len(sample), dtype=np.int16)
        cum_sizes = np.empty(len(sample), dtype=np.int32)

        cumulative_sum = 0
        for j, c in enumerate(sample):
            dr = 1 if c > 0 else -1
            timestamps[j] = abs(c)
            packet_sizes[j] = dr * 512
            cumulative_sum += packet_sizes[j]
            cum_sizes[j] = cumulative_sum

        X_timestamps.append(timestamps)
        X_packet_sizes.append(packet_sizes)
        X_cum_sizes.append(cum_sizes)
        y_monitored.append(label)

print(f"Total monitored samples: {len(y_monitored)}")

# padding
max_length = max(len(seq) for seq in X_timestamps)


def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=np.float32)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq[:maxlen]
    return padded_sequences


X_timestamps_padded = pad_sequences(X_timestamps, max_length)
X_packet_sizes_padded = pad_sequences(X_packet_sizes, max_length)
X_cum_sizes_padded = pad_sequences(X_cum_sizes, max_length)

# feature combination
X_combined = np.hstack((X_timestamps_padded, X_packet_sizes_padded, X_cum_sizes_padded))
y_combined = np.array(y_monitored)

# data split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined)

# Random Forest Multi-Class Classification
print("Training Closed-World Multi-Class Random Forest model...")
rf_multi_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_multi_model.fit(X_train, y_train)

multi_y_pred_rf = rf_multi_model.predict(X_test)
multi_accuracy_rf = accuracy_score(y_test, multi_y_pred_rf)

print(f"Multi-Class Classification Accuracy (Random Forest): {multi_accuracy_rf:.4f}")
print("\nMulti-Class Classification Report (Random Forest):")
print(classification_report(y_test, multi_y_pred_rf))


Loading data...
Total monitored samples: 19000
Training Closed-World Multi-Class Random Forest model...
Multi-Class Classification Accuracy (Random Forest): 0.9079

Multi-Class Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.94      0.80      0.86        40
           1       1.00      1.00      1.00        40
           2       1.00      0.95      0.97        40
           3       0.90      0.95      0.93        40
           4       0.95      0.97      0.96        40
           5       0.94      0.82      0.88        40
           6       0.93      0.93      0.93        40
           7       0.84      0.95      0.89        40
           8       0.95      0.90      0.92        40
           9       0.90      0.88      0.89        40
          10       0.94      0.82      0.88        40
          11       0.97      0.95      0.96        40
          12       0.93      0.95      0.94        40
          13       0.86    

## Open world : Decision Tree


In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950

print("Loading data...")
with open("/content/sample_data/mon_standard.pkl", "rb") as file:
    data = pickle.load(file)

X_timestamps = []  # Packet timestamps
X_packet_sizes = []  # Packet sizes
X_cum_sizes = []  # Cumulative packet size
y_monitored = []  # Labels

for i in range(TOTAL_URLS):
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        timestamps = np.empty(len(sample), dtype=np.float32)
        packet_sizes = np.empty(len(sample), dtype=np.int16)
        cum_sizes = np.empty(len(sample), dtype=np.int32)

        cumulative_sum = 0
        for j, c in enumerate(sample):
            dr = 1 if c > 0 else -1
            timestamps[j] = abs(c)
            packet_sizes[j] = dr * 512
            cumulative_sum += packet_sizes[j]
            cum_sizes[j] = cumulative_sum

        X_timestamps.append(timestamps)
        X_packet_sizes.append(packet_sizes)
        X_cum_sizes.append(cum_sizes)
        y_monitored.append(label)

print(f"Total monitored samples: {len(y_monitored)}")

TOTAL_UNMON_URLS = 3000

print("Loading unmonitored datafile...")
with open("/content/sample_data/unmon_standard10_3000.pkl", "rb") as f:
    x = pickle.load(f)

for i in range(TOTAL_UNMON_URLS):
    sample = x[i]
    timestamps = np.empty(len(sample), dtype=np.float32)
    packet_sizes = np.empty(len(sample), dtype=np.int16)
    cum_sizes = np.empty(len(sample), dtype=np.int32)

    cumulative_sum = 0
    for j, c in enumerate(sample):
        dr = 1 if c > 0 else -1
        timestamps[j] = abs(c)
        packet_sizes[j] = dr * 512
        cumulative_sum += packet_sizes[j]
        cum_sizes[j] = cumulative_sum

    X_timestamps.append(timestamps)
    X_packet_sizes.append(packet_sizes)
    X_cum_sizes.append(cum_sizes)
    y_monitored.append(-1)  # Label unmonitored data as -1

print(f"Total combined samples: {len(y_monitored)}")

# padding
max_length = max(len(seq) for seq in X_timestamps)

def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=np.float32)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq[:maxlen]
    return padded_sequences

X_timestamps_padded = pad_sequences(X_timestamps, max_length)
X_packet_sizes_padded = pad_sequences(X_packet_sizes, max_length)
X_cum_sizes_padded = pad_sequences(X_cum_sizes, max_length)

# feature combination
X_combined = np.hstack((X_timestamps_padded, X_packet_sizes_padded, X_cum_sizes_padded))

# data split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_monitored, test_size=0.2, random_state=42)

# Binary Classification (Monitored vs Unmonitored)
binary_y_train = np.where(np.array(y_train) == -1, -1, 1)  # Monitored(1), Unmonitored(-1)
binary_y_test = np.where(np.array(y_test) == -1, -1, 1)

# Decision Tree Binary Classification
print("Training Open-World Binary Decision Tree model...")
dt_binary_model = DecisionTreeClassifier(random_state=42, class_weight="balanced")
dt_binary_model.fit(X_train, binary_y_train)

binary_y_pred_dt = dt_binary_model.predict(X_test)
binary_accuracy_dt = accuracy_score(binary_y_test, binary_y_pred_dt)

print(f"Binary Classification Accuracy (Decision Tree): {binary_accuracy_dt:.4f}")
print("\nBinary Classification Report (Decision Tree):")
print(classification_report(binary_y_test, binary_y_pred_dt))

# Multi-Class Classification (Monitored and Unmonitored)
print("Training Open-World Multi-Class Decision Tree model...")
dt_multi_model = DecisionTreeClassifier(random_state=42, class_weight="balanced")
dt_multi_model.fit(X_train, y_train)

multi_y_pred_dt = dt_multi_model.predict(X_test)
multi_accuracy_dt = accuracy_score(y_test, multi_y_pred_dt)

print(f"Multi-Class Classification Accuracy (Decision Tree): {multi_accuracy_dt:.4f}")
print("\nMulti-Class Classification Report (Decision Tree):")
print(classification_report(y_test, multi_y_pred_dt))


Loading data...
Total monitored samples: 19000
Loading unmonitored datafile...
Total combined samples: 22000
Training Open-World Binary Decision Tree model...
Binary Classification Accuracy (Decision Tree): 0.9998

Binary Classification Report (Decision Tree):
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       590
           1       1.00      1.00      1.00      3810

    accuracy                           1.00      4400
   macro avg       1.00      1.00      1.00      4400
weighted avg       1.00      1.00      1.00      4400

Training Open-World Multi-Class Decision Tree model...
Multi-Class Classification Accuracy (Decision Tree): 0.8473

Multi-Class Classification Report (Decision Tree):
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       590
           0       0.84      0.70      0.76        30
           1       0.91      0.89      0.90        44
           2       0.83     

## Open world : Random Forest

In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950

print("Loading monitored datafile...")
with open("/content/sample_data/mon_standard.pkl", "rb") as file:
    data = pickle.load(file)

X_timestamps = []  # Packet timestamps
X_packet_sizes = []  # Packet sizes
X_cum_sizes = []  # Cumulative packet size
y_monitored = []  # Labels

for i in range(TOTAL_URLS):
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        timestamps = np.empty(len(sample), dtype=np.float32)
        packet_sizes = np.empty(len(sample), dtype=np.int16)
        cum_sizes = np.empty(len(sample), dtype=np.int32)

        cumulative_sum = 0
        for j, c in enumerate(sample):
            dr = 1 if c > 0 else -1
            timestamps[j] = abs(c)
            packet_sizes[j] = dr * 512
            cumulative_sum += packet_sizes[j]
            cum_sizes[j] = cumulative_sum

        X_timestamps.append(timestamps)
        X_packet_sizes.append(packet_sizes)
        X_cum_sizes.append(cum_sizes)
        y_monitored.append(label)

print(f"Total monitored samples: {len(y_monitored)}")

TOTAL_UNMON_URLS = 3000

print("Loading unmonitored datafile...")
with open("/content/sample_data/unmon_standard10_3000.pkl", "rb") as f:
    x = pickle.load(f)

for i in range(TOTAL_UNMON_URLS):
    sample = x[i]
    timestamps = np.empty(len(sample), dtype=np.float32)
    packet_sizes = np.empty(len(sample), dtype=np.int16)
    cum_sizes = np.empty(len(sample), dtype=np.int32)

    cumulative_sum = 0
    for j, c in enumerate(sample):
        dr = 1 if c > 0 else -1
        timestamps[j] = abs(c)
        packet_sizes[j] = dr * 512
        cumulative_sum += packet_sizes[j]
        cum_sizes[j] = cumulative_sum

    X_timestamps.append(timestamps)
    X_packet_sizes.append(packet_sizes)
    X_cum_sizes.append(cum_sizes)
    y_monitored.append(-1)  # Label unmonitored data as -1

print(f"Total combined samples: {len(y_monitored)}")

# padding
max_length = max(len(seq) for seq in X_timestamps)

def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=np.float32)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq[:maxlen]
    return padded_sequences

X_timestamps_padded = pad_sequences(X_timestamps, max_length)
X_packet_sizes_padded = pad_sequences(X_packet_sizes, max_length)
X_cum_sizes_padded = pad_sequences(X_cum_sizes, max_length)

# feature combination
X_combined = np.hstack((X_timestamps_padded, X_packet_sizes_padded, X_cum_sizes_padded))

# data split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_monitored, test_size=0.2, random_state=42)

# Binary Classification (Monitored vs Unmonitored)
binary_y_train = np.where(np.array(y_train) == -1, -1, 1)  # Monitored(1), Unmonitored(-1)
binary_y_test = np.where(np.array(y_test) == -1, -1, 1)

# Random Forest Binary Classification
print("Training Open-World Binary Random Forest model...")
rf_binary_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_binary_model.fit(X_train, binary_y_train)

binary_y_pred = rf_binary_model.predict(X_test)
binary_accuracy = accuracy_score(binary_y_test, binary_y_pred)

print(f"Binary Classification Accuracy (Random Forest): {binary_accuracy:.4f}")
print("\nBinary Classification Report (Random Forest):")
print(classification_report(binary_y_test, binary_y_pred))

# Multi-Class Classification (Monitored and Unmonitored)
print("Training Open-World Multi-Class Random Forest model...")
rf_multi_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_multi_model.fit(X_train, y_train)

multi_y_pred = rf_multi_model.predict(X_test)
multi_accuracy = accuracy_score(y_test, multi_y_pred)

print(f"Multi-Class Classification Accuracy (Random Forest): {multi_accuracy:.4f}")
print("\nMulti-Class Classification Report (Random Forest):")
print(classification_report(y_test, multi_y_pred))


Loading monitored datafile...
Total monitored samples: 19000
Loading unmonitored datafile...
Total combined samples: 22000
Training Open-World Binary Random Forest model...
Binary Classification Accuracy (Random Forest): 0.9932

Binary Classification Report (Random Forest):
              precision    recall  f1-score   support

          -1       1.00      0.95      0.97       590
           1       0.99      1.00      1.00      3810

    accuracy                           0.99      4400
   macro avg       1.00      0.98      0.99      4400
weighted avg       0.99      0.99      0.99      4400

Training Open-World Multi-Class Random Forest model...
Multi-Class Classification Accuracy (Random Forest): 0.8925

Multi-Class Classification Report (Random Forest):
              precision    recall  f1-score   support

          -1       0.74      0.95      0.83       590
           0       0.96      0.90      0.93        30
           1       0.89      0.75      0.81        44
           2  