In [1]:
# =============================================================================
# 0  Imports, RNG, paths
# =============================================================================
from pathlib import Path
import random, math, itertools, time

import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from hmmlearn import hmm
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import torch.nn.functional as F

# One seed shared by every RNG this notebook touches (stdlib, NumPy, PyTorch).
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Input file and output folders, all relative to the working directory.
RAW_PATH   = Path('real_data/df_raw_UDP_GOOGLE_HOME.csv')
MODELS_DIR = Path('models')
DATA_DIR   = Path('synth_data')

# Create the output folders up front; existing folders are left untouched.
for _out_dir in (MODELS_DIR, DATA_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# =============================================================================
# 1  Load & pre‑process packets
# =============================================================================
# Tunables for this stage, gathered here so they are easy to find and adjust.
MAX_TIME_DIFF = 25e-3             # rows with time_diff >= this are removed (25 ms per the
                                  # original note; presumably time_diff is in seconds — confirm)
DROP_SINGLE_PACKET_FLOWS = False  # the single-packet-flow filter was commented out in the
                                  # original; it stays off unless this is set to True

print('\u25ba  Reading raw packets …')
df = pd.read_csv(
    RAW_PATH,
    usecols=['flow_id', 'payload_length', 'time_diff']
)

# basic cleaning: drop every row where either modelled column is NaN or ±inf
_df_invalid = (~np.isfinite(df['payload_length'])) | (~np.isfinite(df['time_diff']))
if _df_invalid.any():
    print(f'   ! dropping {_df_invalid.sum()} rows with non‑finite values')
    # .copy() gives an independent frame, so the column assignments below do not
    # hit pandas' SettingWithCopyWarning.
    df = df[~_df_invalid].copy()

# physical bounds: negative values are clamped to 0.
# No fillna is needed here: np.isfinite(NaN) is False, so NaN rows were already
# removed by the mask above and clip() cannot introduce new ones.
df['payload_length'] = df['payload_length'].clip(lower=0)
df['time_diff']      = df['time_diff'].clip(lower=0)

# log1p of the payload length, so zero-length payloads map to 0 instead of -inf
df['payload_log'] = np.log1p(df['payload_length'])

# Keep only rows whose time_diff is strictly below the threshold.
# .copy() again so later cells can add columns to df without warnings.
df = df[df['time_diff'] < MAX_TIME_DIFF].copy()

# columns to be modelled (before z‑score)
cols_raw = ['payload_log', 'time_diff']

# Flows with more than one packet. These are always computed, but only applied
# as a filter when DROP_SINGLE_PACKET_FLOWS is enabled.
flow_sizes = df.groupby('flow_id').size()
valid_flows = flow_sizes[flow_sizes > 1].index
if DROP_SINGLE_PACKET_FLOWS:
    df = df[df['flow_id'].isin(valid_flows)].copy()

# -----------------------------------------------------------------------------
# 1.1   Hold out 10 % of the flow ids for testing. The split is done on flow ids,
#       so all packets of a flow land on the same side.
flow_ids = df['flow_id'].unique()
train_flows, test_flows = train_test_split(
    flow_ids,
    test_size=0.10,
    random_state=SEED,
)
_in_test_flows = df['flow_id'].isin(test_flows)
test_df = df.loc[_in_test_flows].copy()

# Load the two comparison datasets stored under comparison/.
necstgen_df, hmmmdn_df = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        'comparison/DF_GEN_FLOW_UDP_GOOGLE_HOME_FINAL.csv',
        'comparison/synthetic_packets_hmmmdn_global_UDP_GOGL.csv',
    )
)


►  Reading raw packets …


In [2]:

# Randomly subsample each comparison frame down to the size of test_df.
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so n is capped at the frame's own length. A comparison set
# smaller than test_df is then used in full rather than crashing the cell.
necstgen_df = necstgen_df.sample(n=min(len(test_df), len(necstgen_df)), random_state=SEED)
hmmmdn_df = hmmmdn_df.sample(n=min(len(test_df), len(hmmmdn_df)), random_state=SEED)


In [3]:
# Two-sample KS test on the time_diff column: held-out real packets against
# each of the two comparison frames.
_real_dt = test_df['time_diff']
stat_necstgen_dt, p_necstgen_dt = ks_2samp(_real_dt, necstgen_df['time_diff'])
stat_hmmmdn_dt, p_hmmmdn_dt = ks_2samp(_real_dt, hmmmdn_df['time_diff'])
print(
    f'   time_diff      : KS stat NeCSTgen= {stat_necstgen_dt:.4f}, p‑value = {p_necstgen_dt:.4g}',
    f'   time_diff      : KS stat HMM-MDN= {stat_hmmmdn_dt:.4f}, p‑value = {p_hmmmdn_dt:.4g}',
    sep='\n',
)


   time_diff      : KS stat NeCSTgen= 0.8742, p‑value = 3.119e-294
   time_diff      : KS stat HMM-MDN= 0.1962, p‑value = 7.466e-13


In [4]:
# Load one more comparison dataset from the comparison/ folder.
_necstgen_packet_csv = 'comparison/DF_GEN_PACKET_UDP_GOOGLE_HOME_FLOW_NO_CONNECT_FINAL.csv'
necstgen_df_2 = pd.read_csv(_necstgen_packet_csv)

In [5]:
# Randomly subsample necstgen_df_2 down to the size of test_df.
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so n is capped at the frame's own length.
necstgen_df_2 = necstgen_df_2.sample(n=min(len(test_df), len(necstgen_df_2)), random_state=SEED)

# Two-sample KS test of the real time_diff against the two time_diff columns
# of necstgen_df_2 (time_diff_vae and time_diff_lstm).
stat_necstgen_2_dt_vae, p_necstgen_2_dt_vae = ks_2samp(test_df['time_diff'], necstgen_df_2['time_diff_vae'])
stat_necstgen_2_dt_lstm, p_necstgen_2_dt_lstm = ks_2samp(test_df['time_diff'], necstgen_df_2['time_diff_lstm'])
print(f'   time_diff      : KS stat NeCSTgen VAE= {stat_necstgen_2_dt_vae:.4f}, p‑value = {p_necstgen_2_dt_vae:.4g}')
print(f'   time_diff      : KS stat NeCSTgen LSTM= {stat_necstgen_2_dt_lstm:.4f}, p‑value = {p_necstgen_2_dt_lstm:.4g}')

   time_diff      : KS stat NeCSTgen VAE= 0.3383, p‑value = 7.29e-38
   time_diff      : KS stat NeCSTgen LSTM= 0.6806, p‑value = 2.434e-163


In [6]:
# Payload KS comparisons: the real payload_length column against each
# comparison frame's payload/length column.
_real_payload = test_df['payload_length']
stat_necstgen_2_pl_vae, p_necstgen_2_pl_vae = ks_2samp(
    _real_payload, necstgen_df_2['payload_length_all_vae']
)
stat_necstgen_2_pl_lstm, p_necstgen_2_pl_lstm = ks_2samp(
    _real_payload, necstgen_df_2['length_total_lstm']
)
stat_hmmmdn_pl, p_hmmmdn_pl = ks_2samp(
    _real_payload, hmmmdn_df['payload_length']
)
print(
    f'   payload_length : KS stat NeCSTgen VAE= {stat_necstgen_2_pl_vae:.4f},  p‑value = {p_necstgen_2_pl_vae:.4g}',
    f'   payload_length : KS stat NeCSTgen LSTM= {stat_necstgen_2_pl_lstm:.4f},  p‑value = {p_necstgen_2_pl_lstm:.4g}',
    f'   payload_length : KS stat HMM-MDN= {stat_hmmmdn_pl:.4f}, p‑value = {p_hmmmdn_pl:.4g}',
    sep='\n',
)

   payload_length : KS stat NeCSTgen VAE= 0.2300,  p‑value = 1.499e-17
   payload_length : KS stat NeCSTgen LSTM= 0.4533,  p‑value = 8.526e-69
   payload_length : KS stat HMM-MDN= 0.1407, p‑value = 8.47e-07
