### DAGMM on KDDCUP99

This notebook trains **DAGMM** with **Gaussian / Laplace / Student‑t** mixture components on the **KDDCUP99** dataset.
You can switch the mixture distribution with the `dist_type` parameter below (`'gaussian'|'laplace'|'student_t'`).

In [1]:
import numpy as np 
import pandas as pd
import torch
from data_loader import *
from main import *
from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score

In [2]:
dist_type = 'gaussian'   # 'gaussian' | 'laplace' | 'student_t'
student_nu = 4.0         # only used if dist_type == 'student_t'

In [3]:
data = pd.read_csv("kddcup.data_10_percent", header=None,names=['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'type'])

In [4]:
# ==== Pre-processing ====
data.loc[data["type"] != "normal.", 'type'] = 0
data.loc[data["type"] == "normal.", 'type'] = 1

one_hot_protocol = pd.get_dummies(data["protocol_type"])
one_hot_service = pd.get_dummies(data["service"])
one_hot_flag = pd.get_dummies(data["flag"])

data = data.drop("protocol_type",axis=1)
data = data.drop("service",axis=1)
data = data.drop("flag",axis=1)
    
data = pd.concat([one_hot_protocol, one_hot_service,one_hot_flag, data],axis=1)
data.head()

Unnamed: 0,icmp,tcp,udp,IRC,X11,Z39_50,auth,bgp,courier,csnet_ns,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,False,True,False,False,False,False,False,False,False,False,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,1
1,False,True,False,False,False,False,False,False,False,False,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,1
2,False,True,False,False,False,False,False,False,False,False,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,1
3,False,True,False,False,False,False,False,False,False,False,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,1
4,False,True,False,False,False,False,False,False,False,False,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,1


In [5]:
cols_to_norm = ["duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent", 
            "hot", "num_failed_logins", "num_compromised", "num_root", 
            "num_file_creations", "num_shells", "num_access_files", "count", "srv_count", 
            "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", 
            "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", 
            "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", 
            "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate" ]

min_cols = data.loc[data["type"]==0 , cols_to_norm].min()
max_cols = data.loc[data["type"]==0 , cols_to_norm].max()

data[cols_to_norm] = data[cols_to_norm].astype(np.float32)
data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)


  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
 4.19685930e-07 3.15846112e-07]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
 2.32762574e-04 2.39357513e-04]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max_cols - min_cols)
  data.loc[:, cols_to_norm] = (data[cols_to_norm] - min_cols) / (max

In [6]:
np.savez_compressed("kdd_cup",kdd=data.to_numpy())

In [7]:
# ==== Defining Parameters ====
class hyperparams():
    def __init__(self, config):
        self.__dict__.update(**config)
defaults = {
    'lr' : 1e-4,
    'num_epochs' : 10,
    'batch_size' : 1024,
    'gmm_k' : 4,
    'lambda_energy' : 0.1,
    'lambda_cov_diag' : 0.005,
    'pretrained_model' : None,
    'mode' : 'train',
    'use_tensorboard' : False,
    'data_path' : 'kdd_cup.npz',
    'input_dim': 118,

    'model_save_path' : './dagmm_test/models',

    'sample_step' : 194,
    'model_save_step' : 194,

    'dist_type': dist_type,     # options: 'gaussian', 'laplace', 'student_t'
    'student_nu': student_nu,           # ν value for Student's t-distribution

}

In [8]:
solver = main(hyperparams(defaults))
accuracy, precision, recall, f_score = solver.test()

100%|██████████| 194/194 [00:02<00:00, 77.55it/s]
100%|██████████| 194/194 [00:02<00:00, 82.38it/s]
100%|██████████| 194/194 [00:02<00:00, 80.95it/s]
100%|██████████| 194/194 [00:02<00:00, 76.56it/s]
100%|██████████| 194/194 [00:02<00:00, 79.24it/s]
100%|██████████| 194/194 [00:02<00:00, 71.37it/s]
100%|██████████| 194/194 [00:02<00:00, 81.68it/s]
100%|██████████| 194/194 [00:02<00:00, 72.42it/s]
100%|██████████| 194/194 [00:02<00:00, 76.02it/s]
100%|██████████| 194/194 [00:02<00:00, 79.40it/s]


Threshold : -13.707174301147461
Accuracy : 0.8835, Precision : 0.8619, Recall : 0.7692, F-score : 0.8129


In [9]:
# ==== Train ====
solver.data_loader.dataset.mode="train"
solver.dagmm.eval()
N = 0
mu_sum = 0
cov_sum = 0
gamma_sum = 0

for it, (input_data, labels) in enumerate(solver.data_loader):
    input_data = solver.to_var(input_data)
    enc, dec, z, gamma = solver.dagmm(input_data)
    phi, mu, cov = solver.dagmm.compute_gmm_params(z, gamma)
    
    batch_gamma_sum = torch.sum(gamma, dim=0)
    
    gamma_sum += batch_gamma_sum
    mu_sum += mu * batch_gamma_sum.unsqueeze(-1)
    cov_sum += cov * batch_gamma_sum.unsqueeze(-1).unsqueeze(-1)
    
    N += input_data.size(0)
    
train_phi = gamma_sum / N
train_mu = mu_sum / gamma_sum.unsqueeze(-1)
train_cov = cov_sum / gamma_sum.unsqueeze(-1).unsqueeze(-1)

In [10]:
train_energy = []
train_labels = []
train_z = []
for it, (input_data, labels) in enumerate(solver.data_loader):
    input_data = solver.to_var(input_data)
    enc, dec, z, gamma = solver.dagmm(input_data)
    sample_energy = solver.dagmm.compute_energy(z, phi=train_phi)
    cov_diag = torch.sum(1.0 / solver.dagmm.cov.diagonal(dim1=-2, dim2=-1))

    
    train_energy.append(sample_energy.data.cpu().numpy())
    train_z.append(z.data.cpu().numpy())
    train_labels.append(labels.numpy())

train_energy = np.concatenate(train_energy,axis=0)
train_z = np.concatenate(train_z,axis=0)
train_labels = np.concatenate(train_labels,axis=0)

In [11]:
# ==== Test ====
solver.data_loader.dataset.mode="test"
test_energy = []
test_labels = []
test_z = []
for it, (input_data, labels) in enumerate(solver.data_loader):
    input_data = solver.to_var(input_data)
    enc, dec, z, gamma = solver.dagmm(input_data)
    sample_energy = solver.dagmm.compute_energy(z, phi=train_phi)
    cov_diag = torch.sum(1.0 / solver.dagmm.cov.diagonal(dim1=-2, dim2=-1))

    test_energy.append(sample_energy.data.cpu().numpy())
    test_z.append(z.data.cpu().numpy())
    test_labels.append(labels.numpy())


test_energy = np.concatenate(test_energy,axis=0)
test_z = np.concatenate(test_z,axis=0)
test_labels = np.concatenate(test_labels,axis=0)

In [12]:
combined_energy = np.concatenate([train_energy, test_energy], axis=0)
combined_z = np.concatenate([train_z, test_z], axis=0)
combined_labels = np.concatenate([train_labels, test_labels], axis=0)

In [13]:
thresh = np.percentile(combined_energy, 100 - 20)
print("Threshold :", thresh)

Threshold : -12.663023948669434


In [14]:
pred = (test_energy>thresh).astype(int)
gt = test_labels.astype(int)

In [15]:
accuracy = accuracy_score(gt,pred)
precision, recall, f_score, support = prf(gt, pred, average='binary')

In [16]:
print(f"Results for {dist_type} distribution:")
print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f}".format(accuracy,precision, recall, f_score))

Results for gaussian distribution:
Accuracy : 0.9135, Precision : 0.8996, Recall : 0.8296, F-score : 0.8632
