In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import os
import pickle, itertools
from collections import defaultdict

import pandas as pd


In [42]:

class CGAN(nn.Module):
    def __init__(self, arguments, X, y):
        """Conditional Generative Adversarial Network class"""
        super(CGAN, self).__init__()

        [self.rand_noise_dim, self.tot_epochs, self.batch_size, self.D_epochs, \
         self.G_epochs, self.learning_rate, self.n_layers, self.activation, self.optimizer, self.min_num_neurons] = arguments

        self.X_train = torch.tensor(X, dtype=torch.float32)
        self.y_train = torch.tensor(y, dtype=torch.float32)

        self.label_dim = y.shape[1]
        self.x_data_dim = X.shape[1]

        self.g_losses = []
        self.d_losses, self.disc_loss_real, self.disc_loss_generated = [], [], []
        self.acc_history = []
        
        # Define the models
        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator(X)

        self.optim_G = optim.Adam(self.generator.parameters(), lr=self.learning_rate)
        self.optim_D = optim.Adam(self.discriminator.parameters(), lr=self.learning_rate)

        self.criterion = nn.BCELoss()
        self.gan_name = '_'.join(str(e) for e in arguments).replace(".", "")
        
        self.terminated = False

    def build_generator(self):
        """Create the generator model"""
        layers = []
        layers.append(nn.Linear(self.rand_noise_dim + self.label_dim, self.min_num_neurons))
        layers.append(self.get_activation(self.activation))
        
        for i in range(1, self.n_layers + 1):
            layers.append(nn.Linear(self.min_num_neurons * i, self.min_num_neurons * (i + 1)))
            layers.append(self.get_activation(self.activation))

        layers.append(nn.Linear(self.min_num_neurons * (self.n_layers + 1), self.x_data_dim))
        layers.append(nn.Sigmoid())

        return nn.Sequential(*layers)

    def build_discriminator(self,x):
        """Create the discriminator model"""
        layers = []
        layers.append(nn.Linear(self.x_data_dim + self.label_dim, self.min_num_neurons * self.n_layers))
        layers.append(self.get_activation(self.activation))
        
        # for n in reversed(range(1, self.n_layers + 1)):
        #     layers.append(nn.Linear(self.min_num_neurons * n, self.min_num_neurons * (n - 1)))
        #     layers.append(self.get_activation(self.activation))
        
        # layers.append(nn.Linear(self.min_num_neurons, 1))
        # layers.append(nn.Sigmoid())

        # return nn.Sequential(*layers)
        input_dim = self.x_data_dim + self.label_dim  # 输入特征加上标签的维度
        for n in reversed(range(1, self.n_layers + 1)):
            x = nn.Linear(input_dim, self.min_num_neurons * n)(x)  # 确保输入维度正确
            x = nn.LeakyReLU(0.2)(x)
        
        x = nn.Linear(self.min_num_neurons, 1)(x)
        x = nn.Sigmoid()(x)
        return x

    def get_activation(self, activation_name):
        """Helper to get activation function"""
        if activation_name == 'relu':
            return nn.ReLU()
        elif activation_name == 'tanh':
            return nn.Tanh()
        elif activation_name == 'leakyrelu':
            return nn.LeakyReLU(0.2)
        else:
            raise ValueError(f"Activation {activation_name} not implemented")

    def get_batch(self):
        """Get a random batch of training data"""
        batch_ix = np.random.choice(len(self.X_train), size=self.batch_size, replace=False)
        return self.X_train[batch_ix], self.y_train[batch_ix]

    def dump_to_file(self, save_dir="./logs"):
        """Save the training history to a file"""
        H = defaultdict(dict)
        H["acc_history"] = self.acc_history
        H["Generator_loss"] = self.g_losses
        H["disc_loss_real"] = self.disc_loss_real
        H["disc_loss_gen"] = self.disc_loss_generated
        H["discriminator_loss"] = self.d_losses
        H["rand_noise_dim"], H["total_epochs"] = self.rand_noise_dim, self.tot_epochs
        H["batch_size"], H["learning_rate"] = self.batch_size, self.learning_rate
        H["n_layers"], H["activation"] = self.n_layers, self.activation
        H["optimizer"], H["min_num_neurons"] = self.optimizer, self.min_num_neurons

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        with open(f"{save_dir}/{self.gan_name}.pickle", "wb") as output_file:
            pickle.dump(H, output_file)

    def train(self):
        """Train the CGAN model"""
        real_labels = Variable(torch.ones(self.batch_size, 1))
        fake_labels = Variable(torch.zeros(self.batch_size, 1))

        for epoch in range(self.tot_epochs):
            # Train Discriminator
            for i in range(self.D_epochs):
                x, labels = self.get_batch()
                noise = torch.randn(self.batch_size, self.rand_noise_dim)
                generated_x = self.generator(torch.cat([noise, labels], 1))

                # Train on real and fake data
                self.optim_D.zero_grad()
                real_loss = self.criterion(self.discriminator(torch.cat([x, labels], 1)), real_labels)
                fake_loss = self.criterion(self.discriminator(torch.cat([generated_x.detach(), labels], 1)), fake_labels)
                d_loss = 0.5 * (real_loss + fake_loss)
                d_loss.backward()
                self.optim_D.step()

            # Train Generator
            for j in range(self.G_epochs):
                self.optim_G.zero_grad()
                noise = torch.randn(self.batch_size, self.rand_noise_dim)
                labels = torch.randint(0, 5, (self.batch_size, self.label_dim))
                generated_x = self.generator(torch.cat([noise, labels], 1))
                g_loss = self.criterion(self.discriminator(torch.cat([generated_x, labels], 1)), real_labels)
                g_loss.backward()
                self.optim_G.step()

            self.g_losses.append(g_loss.item())
            self.d_losses.append(d_loss.item())
            self.acc_history.append([real_loss.item(), fake_loss.item()])

            print(f"Epoch {epoch} [D loss: {d_loss.item():.4f}, G loss: {g_loss.item():.4f}]")

            if torch.isnan(d_loss) or torch.isnan(g_loss):
                self.terminated = True
                break

In [43]:
from sklearn.preprocessing import LabelEncoder, StandardScaler ,MinMaxScaler,RobustScaler, PowerTransformer, normalize, OrdinalEncoder



data_folder = "./GANs_for_Network_Intrusion_Data/Data/NSL-KDD"

def get_data( data_folder = data_folder):
    """
    Retrive Train and Test data
    """
    print(os.getcwd())  # 打印当前工作目录
    print(os.path.exists(data_folder))  # 应该输出 True

    train = pd.read_csv(data_folder+"/KDDTrain.csv")
    test = pd.read_csv(data_folder+"/KDDTest.csv")

    # 将标签进行编码
    le = LabelEncoder()
    le.fit(train.label)
    label_mapping = {l: i for i, l in enumerate(le.classes_)}

    train['label'] = le.transform(train.label)
    test['label'] = le.transform(test.label)

    cols=["protocol_type","service","flag"]
    # 对数据进行Label编码
    enc = OrdinalEncoder()
    train[cols] = enc.fit_transform(train[cols])
    test[cols] = enc.transform(test[cols])
    
    return train, test, label_mapping


In [44]:

def preprocess(x_train, x_test, data_cols, preprocessor = "StandardScaler",reject_features=False):
    """
    Scale and transform data with an option to remove highly correlated features
    """
    if reject_features :
        # profile = pandas_profiling.ProfileReport(x_train)
        # to_drop = profile.get_rejected_variables()
        to_drop = ['dst_host_srv_serror_rate','num_root','rerror_rate',
                    'serror_rate','srv_rerror_rate','srv_serror_rate']
        x_train.drop(to_drop,axis=1,inplace=True)
        x_test.drop(to_drop,axis=1,inplace=True)
        data_cols = list(x_train.columns[ x_train.columns != 'label' ])

    if preprocessor == "MinMax":
        scaler = MinMaxScaler(feature_range=(0, 1))
        x_train[data_cols] = scaler.fit_transform(x_train[data_cols])
        x_test[data_cols] = scaler.transform(x_test[data_cols])
        return x_train, x_test

    if preprocessor == "Robust":
        scaler = RobustScaler(quantile_range=(0.1, 99.9))
        x_train[data_cols] = scaler.fit_transform(x_train[data_cols])
        x_test[data_cols] = scaler.transform(x_test[data_cols])
        return x_train, x_test

    if preprocessor == "power_transform":
        pt = PowerTransformer(method="yeo-johnson")
        x_train[data_cols] = pt.fit_transform(x_train[data_cols])
        x_test[data_cols] = pt.transform(x_test[data_cols])
        return x_train, x_test

    else :
        scaler = StandardScaler()
        x_train[data_cols] = scaler.fit_transform(x_train[data_cols])
        x_test[data_cols] = scaler.transform(x_test[data_cols])
        return x_train, x_test


In [45]:
def get_contant_featues(X,data_cols,threshold=0.995):
    """
    Finds columns with contant value

    Parameters:
    ----------
    X : pandas DataFrame, shape = [n_samples, n_features]
        Dataset to be analyzed
    data_cols : List, array-like
        feature names of the input data X
    threshold : Float
        threshold to determine if a feature has contant value

    Return
    ------
    result : List , array-like
        list of features having a contant value in Data X
     """
    result = []
    for col in data_cols:
        val, counts = np.unique(X[col],return_counts=True)
        v = counts[0]/counts.sum()
        if v > threshold:
            result.append(col)

    return result

In [46]:
train,test, label_mapping = get_data()
data_cols = list(train.columns[ train.columns != 'label' ])
x_train , x_test = preprocess(train,test,data_cols,"Robust",True)

y_train = x_train.label.values
y_test = x_test.label.values

data_cols = list(x_train.columns[ x_train.columns != 'label' ])

to_drop = get_contant_featues(x_train,data_cols)
x_train.drop(to_drop, axis=1,inplace=True)
x_test.drop(to_drop, axis=1,inplace=True)

data_cols = list(x_train.columns[ x_train.columns != 'label' ])

att_ind = np.where(x_train.label != label_mapping["normal"])[0]
x = x_train[data_cols].values[att_ind]
y = y_train[att_ind].reshape(-1,1)

d:\WorkSpace\GAN
True


In [47]:
# rand_dim = np.arange(10,110,10)
base_n_count = np.arange(3,41,3)
ephocs = np.arange(100,5000,100)
batch_sizes = [64,128,250,300,350]
learning_rates = np.logspace(-1,-4,num=20)
num_layers = np.arange(3,20)

optimizers = ["sgd", "RMSprop", "adam", "Adagrad", "Adamax","Nadam"]
activation_func = ["tanh","relu","softplus","linear","elu"]

#create a logs directory
if not os.path.exists('logs'):
    os.makedirs('logs')

In [48]:
# 确保 x 和 labels 都是 PyTorch Tensor
x = torch.from_numpy(x).float()  # NumPy 转换为 Tensor


tot = list(itertools.product([32],ephocs,batch_sizes,[1],[1],\
                             learning_rates,num_layers,activation_func,optimizers,base_n_count))
for i in tot:
    args = list(i)
    cgan = CGAN(args,x,y)
    cgan.train()
    if not cgan.terminated :
        cgan.dump_to_file()

  self.X_train = torch.tensor(X, dtype=torch.float32)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (58630x25 and 26x9)