<a href="https://colab.research.google.com/github/DanhKiD/Bot_GAN/blob/main/Bot_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.activations import sigmoid
from keras.backend import binary_crossentropy
from keras.layers import Input, Dense, Activation, dense_attention
from keras.layers.merge import Maximum, Concatenate
from keras.models import Model
from keras.optimizers import Adam
from numpy.lib.function_base import blackman

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model, svm, tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from VOTEClassifier import VOTEClassifier

In [None]:
class BotGAN():
    """GAN that perturbs botnet feature vectors so a black-box detector misses them.

    Three models cooperate:

    * ``blackbox_detector`` - a scikit-learn estimator playing the role of the
      deployed detector; it is only queried through ``fit``/``predict``/``score``.
    * ``substitue_detector`` - a Keras network trained to imitate the labels
      the black box assigns (attribute name kept as-is for compatibility).
    * ``generator`` - a Keras network mapping ``(features, noise)`` to
      adversarial features.

    ``combine`` stacks the generator on the frozen substitute detector and is
    the model used to update the generator. Label ``1`` denotes botnet rows and
    label ``0`` normal rows (see ``load_data``).
    """

    def __init__(self, blackbox='RF', same_train_data=1, filename='/content/drive/MyDrive/Dataset/train.csv') -> None:
        """Build the black box, the substitute detector, the generator and the stacked model.

        Args:
            blackbox: Which black-box detector to build: 'RF', 'SVM', 'LR',
                'DT' or 'MLP'.
            same_train_data: Truthy if the GAN and the black box train on the
                same split, falsy if they train on disjoint halves.
            filename: CSV file read by ``load_data`` and ``preprocessing``.

        Raises:
            ValueError: If ``blackbox`` is not one of the supported names.
        """
        self.apifeatures_dims = 53  # number of feature columns expected per sample
        self.z_dims = 10            # length of the noise vector fed to the generator
        self.hide_layers = 256      # width of the single hidden Dense layer
        self.generator_layers = [self.apifeatures_dims + self.z_dims, self.hide_layers, self.apifeatures_dims]
        self.substitue_detector_layers = [self.apifeatures_dims, self.hide_layers, 1]
        self.blackbox = blackbox  # RF, LR, DT, SVM, MLP
        self.same_train_data = same_train_data  # BotGAN and the blackbox_detector are trained on same or different training set
        optimizer = Adam(learning_rate=0.001)
        self.filename = filename

        # Build the (still unfitted) blackbox_detector; it is fitted in train().
        self.blackbox_detector = self.build_blackbox_detector()

        # Build and compile the substitute_detector
        self.substitue_detector = self.build_substitute_detector()
        self.substitue_detector.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes botnet examples and noise as input and generates
        # adversarial botnet examples.
        examples = Input(shape=(self.apifeatures_dims,))
        noise = Input(shape=(self.z_dims,))
        gan_inputs = [examples, noise]
        botnet_examples = self.generator(gan_inputs)

        # For the combined model we only train the generator: the flag is
        # flipped after the detector was compiled on its own and before the
        # stacked model is compiled.
        self.substitue_detector.trainable = False

        # The substitute detector scores the generated examples.
        validity = self.substitue_detector(botnet_examples)

        # The combined model (stacked generator and substitute_detector)
        # trains the generator to fool the substitute detector.
        self.combine = Model(gan_inputs, validity)
        self.combine.compile(loss='binary_crossentropy', optimizer=optimizer)

    @staticmethod
    def _binarize(examples):
        """Return a float array holding 1.0 where ``examples > 0.5`` and 0.0 elsewhere."""
        return (np.asarray(examples) > 0.5).astype(np.float64)

    @staticmethod
    def _parse_port(value):
        """Convert a port given as int, decimal string or ``0x``-prefixed hex string to int."""
        text = str(value).strip()
        if text[:2].lower() == '0x':
            return int(text, 16)
        return int(text)

    def build_blackbox_detector(self):
        """Create the unfitted scikit-learn estimator selected by ``self.blackbox``.

        Returns:
            A new, unfitted classifier.

        Raises:
            ValueError: If ``self.blackbox`` is not a supported name (instead of
                silently returning ``None`` and failing later on ``fit``).
        """
        # Factories are lazy so only the requested estimator is instantiated.
        factories = {
            'RF': lambda: RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1),
            'SVM': lambda: svm.SVC(),
            'LR': lambda: linear_model.LogisticRegression(),
            # A classifier, not a regressor: train() relies on score() being
            # classification accuracy and on predict() returning class labels.
            'DT': lambda: tree.DecisionTreeClassifier(),
            'MLP': lambda: MLPClassifier(hidden_layer_sizes=(10,), max_iter=10, alpha=1e-1,
                                         solver='sgd', verbose=0, tol=1e-4, random_state=1,
                                         learning_rate_init=.1),
        }
        if self.blackbox not in factories:
            raise ValueError(
                'Unsupported blackbox {0!r}; expected one of {1}'.format(self.blackbox, sorted(factories)))

        return factories[self.blackbox]()

    def build_generator(self):
        """Build the generator network and print its summary.

        The feature vector and the noise vector are concatenated, passed
        through the Dense layers listed in ``generator_layers[1:]`` and a final
        sigmoid. The output is the element-wise maximum of the input features
        and that sigmoid output, so no output value is below the matching input.

        Returns:
            The Keras ``Model`` named ``'generator'`` with inputs ``[examples, noise]``.
        """
        examples = Input(shape=(self.apifeatures_dims,))
        noise = Input(shape=(self.z_dims,))
        x = Concatenate(axis=1)([examples, noise])
        # NOTE(review): the Dense layers carry no activation of their own, so
        # only the final sigmoid is non-linear. Kept as-is to preserve the
        # model; confirm whether a hidden activation was intended.
        for dim in self.generator_layers[1:]:
            x = Dense(dim)(x)
        x = Activation(activation='sigmoid')(x)
        x = Maximum()([examples, x])
        generator = Model([examples, noise], x, name='generator')
        generator.summary()

        return generator

    def build_substitute_detector(self):
        """Build the substitute detector network and print its summary.

        Returns:
            The uncompiled Keras ``Model`` named ``'substitute_detector'``; it
            maps a feature vector to a single sigmoid output.
        """
        features = Input(shape=(self.substitue_detector_layers[0],))
        x = features
        # NOTE(review): as in the generator, the Dense layers have no
        # activation between them; only the final sigmoid is non-linear.
        for dim in self.substitue_detector_layers[1:]:
            x = Dense(dim)(x)
        x = Activation(activation='sigmoid')(x)
        substitute_detector = Model(features, x, name='substitute_detector')
        substitute_detector.summary()

        return substitute_detector

    def preprocessing(self):
        """Load the raw CSV at ``self.filename`` and return a scaled feature table.

        Steps: drop bookkeeping columns, convert ``sport``/``dport`` to
        integers, label-encode ``saddr``/``daddr``/``proto``, move the
        ``attack`` column to the end and standard-scale every other column.

        Returns:
            A ``pandas.DataFrame`` whose last column is the unscaled ``attack`` label.
        """
        # Load dataset
        data = pd.read_csv(self.filename)

        # Drop unnecessary columns
        data.drop(["pkSeqID", "seq", "subcategory", "category"], axis=1, inplace=True)

        # Ports may be decimal or 0x-prefixed hex; parse both columns the same
        # way rather than special-casing a handful of known hex values.
        data['sport'] = data['sport'].map(self._parse_port)
        data['dport'] = data['dport'].map(self._parse_port)

        # Encode the categorical columns as integers (a fresh encoder per column).
        for column in ('saddr', 'daddr', 'proto'):
            data[column + '_enc'] = LabelEncoder().fit_transform(data[column])
        data.drop(['saddr', 'daddr', 'proto'], axis=1, inplace=True)

        # Swap the label with the last column; it is located by name so the
        # step does not depend on a fixed column count.
        titles = list(data.columns)
        label_pos = titles.index('attack')
        titles[label_pos], titles[-1] = titles[-1], titles[label_pos]
        data = data[titles]

        # Scale every feature column; the label is re-attached unscaled.
        features = data.iloc[:, :-1]
        scaled_features = StandardScaler().fit_transform(features)
        pre_data = pd.DataFrame(scaled_features, columns=features.columns)
        pre_data['attack'] = data['attack'].values

        return pre_data

    def load_data(self):
        """Read ``self.filename`` and split it by the ``label`` column.

        Returns:
            ``((xbot, ybot), (xnor, ynor))`` as NumPy arrays, where the ``bot``
            pair holds rows with ``label == 1`` and the ``nor`` pair rows with
            ``label == 0``. The feature arrays contain every column except ``label``.
        """
        # Honour the path passed to the constructor instead of a fixed one.
        data = pd.read_csv(self.filename)
        labels = data['label']
        features = data.drop(columns=['label'])
        is_nor = labels == 0
        is_bot = labels == 1

        xnor = features[is_nor].to_numpy()
        ynor = labels[is_nor].to_numpy()
        xbot = features[is_bot].to_numpy()
        ybot = labels[is_bot].to_numpy()

        return (xbot, ybot), (xnor, ynor)

    def train(self, epochs, batch_size=32, is_first=1):
        """Train the substitute detector and the generator against the black box.

        The black-box accuracy on botnet-only samples is reported as TPR, first
        on the original samples and then, after every epoch, on the generated ones.

        Args:
            epochs: Number of passes over the botnet training split.
            batch_size: Samples drawn (with replacement) per training step.
            is_first: Truthy to fit the black box before training and to print
                the per-epoch losses; falsy to reuse the already fitted black box.

        Raises:
            ValueError: If the loaded feature width differs from ``apifeatures_dims``.

        Side effects:
            Saves the combined model's weights whenever the test TPR improves,
            writes the TPR plot to disk and shows it.
        """
        # Load and split the dataset
        (xbot, ybot), (xnor, ynor) = self.load_data()
        if xbot.shape[1] != self.apifeatures_dims:
            raise ValueError('Expected {0} feature columns, got {1}'.format(self.apifeatures_dims, xbot.shape[1]))
        xtrain_bot, xtest_bot, ytrain_bot, ytest_bot = train_test_split(xbot, ybot, test_size=0.3)
        xtrain_nor, _, ytrain_nor, _ = train_test_split(xnor, ynor, test_size=0.3)
        if self.same_train_data:
            bl_xtrain_bot, bl_ytrain_bot, bl_xtrain_nor, bl_ytrain_nor = xtrain_bot, ytrain_bot, xtrain_nor, ytrain_nor
        else:
            xtrain_bot, bl_xtrain_bot, ytrain_bot, bl_ytrain_bot = train_test_split(xtrain_bot, ytrain_bot, test_size=0.5)
            xtrain_nor, bl_xtrain_nor, ytrain_nor, bl_ytrain_nor = train_test_split(xtrain_nor, ytrain_nor, test_size=0.5)

        # If is_first is True, train the blackbox_detector on its own training
        # split only, so the held-out test rows stay unseen and the
        # same_train_data switch actually controls what the black box sees.
        if is_first:
            self.blackbox_detector.fit(np.concatenate([bl_xtrain_bot, bl_xtrain_nor]),
                                       np.concatenate([bl_ytrain_bot, bl_ytrain_nor]))

        # Black-box labels for exactly the normal rows the substitute detector
        # is trained on, so samples and labels stay aligned by index.
        ytrain_nor_blackbox = self.blackbox_detector.predict(xtrain_nor)
        Original_Train_TPR = self.blackbox_detector.score(bl_xtrain_bot, bl_ytrain_bot)
        Original_Test_TPR = self.blackbox_detector.score(xtest_bot, ytest_bot)
        Train_TPR, Test_TPR = [Original_Train_TPR], [Original_Test_TPR]
        best_TPR = 1.0
        print(Train_TPR, Test_TPR, '\n')
        print('Training epochs.....')

        # At least one step per epoch, so the losses printed below are always
        # defined even when the split is smaller than batch_size.
        steps_per_epoch = max(1, xtrain_bot.shape[0] // batch_size)
        for epoch in range(epochs):
            for _ in range(steps_per_epoch):
                # ---- Train substitute_detector ----

                # Select a random batch of botnet examples
                idx = np.random.randint(0, xtrain_bot.shape[0], batch_size)
                xbot_batch = xtrain_bot[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))

                # Select a random batch of normal examples from the whole
                # normal training split.
                idx = np.random.randint(0, xtrain_nor.shape[0], batch_size)
                xnor_batch = xtrain_nor[idx]
                ynor_batch = ytrain_nor_blackbox[idx]

                # Generate adversarial examples from the botnet batch and ask
                # the black box (on the binarised version) how it labels them.
                gen_examples = self.generator.predict([xbot_batch, noise])
                ybot_batch = self.blackbox_detector.predict(self._binarize(gen_examples))

                # Train the substitute_detector on both batches
                d_loss_real = self.substitue_detector.train_on_batch(gen_examples, ybot_batch)
                d_loss_fake = self.substitue_detector.train_on_batch(xnor_batch, ynor_batch)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # ---- Train generator ----
                idx = np.random.randint(0, xtrain_bot.shape[0], batch_size)
                xbot_batch = xtrain_bot[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))

                # Target 0 (the normal label): push the substitute detector
                # towards classifying generated samples as normal.
                g_loss = self.combine.train_on_batch([xbot_batch, noise], np.zeros((batch_size, 1)))

            # Compute Train TPR
            noise = np.random.uniform(0, 1, (xtrain_bot.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtrain_bot, noise])
            TPR = self.blackbox_detector.score(self._binarize(gen_examples), ytrain_bot)
            Train_TPR.append(TPR)

            # Compute Test TPR
            noise = np.random.uniform(0, 1, (xtest_bot.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtest_bot, noise])
            TPR = self.blackbox_detector.score(self._binarize(gen_examples), ytest_bot)
            Test_TPR.append(TPR)
            print(Train_TPR[-1], Test_TPR[-1])

            # Save best model (lowest test TPR seen so far)
            if TPR < best_TPR:
                self.combine.save_weights('/content/drive/MyDrive/Dataset/saves/BotGAN.h5')
                best_TPR = TPR

            # Print the progress
            if is_first:
                print("%d [D loss: %f, acc.: %.4f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

        # Chosen by truthiness so any truthy same_train_data value works.
        data_flag = 'SameTrainData' if self.same_train_data else 'DiffTrainData'
        print('\n\n---{0} {1}'.format(self.blackbox, data_flag))
        print('\nOriginal_Train_TPR: {0}, Adver_Train_TPR: {1}'.format(Original_Train_TPR, Train_TPR[-1]))
        print('\nOriginal_Test_TPR: {0}, Adver_Test_TPR: {1}'.format(Original_Test_TPR, Test_TPR[-1]))

        # Plot TPR
        plt.figure()
        plt.plot(range(len(Train_TPR)), Train_TPR, c='r', label='Training Set', linewidth=2)
        plt.plot(range(len(Test_TPR)), Test_TPR, c='g', linestyle='--', label='Validation Set', linewidth=2)
        plt.xlabel('Epoch')
        plt.ylabel('TPR')
        plt.legend()
        plt.savefig('/content/drive/MyDrive/Dataset/saves/Epoch_TPR{0}, {1}, {2}.png'.format(self.blackbox, data_flag, is_first))
        plt.show()

    def retrain_blackbox_detector(self):
        """Refit the black box on original plus generated botnet samples and print its TPR.

        The data is re-split 80/20. The generated training samples are
        binarised and labelled with the botnet training labels. TPR is the
        black-box accuracy on generated botnet samples of the train and test splits.
        """
        (xbot, ybot), (xnor, ynor) = self.load_data()
        xtrain_bot, xtest_bot, ytrain_bot, ytest_bot = train_test_split(xbot, ybot, test_size=0.2)
        xtrain_nor, _, ytrain_nor, _ = train_test_split(xnor, ynor, test_size=0.2)

        # Generate Train Adversarial Examples
        noise = np.random.uniform(0, 1, (xtrain_bot.shape[0], self.z_dims))
        gen_examples = self._binarize(self.generator.predict([xtrain_bot, noise]))
        self.blackbox_detector.fit(np.concatenate([xtrain_bot, xtrain_nor, gen_examples]),
                                   np.concatenate([ytrain_bot, ytrain_nor, ytrain_bot]))

        # Compute train TPR
        train_TPR = self.blackbox_detector.score(gen_examples, ytrain_bot)

        # Compute test TPR
        noise = np.random.uniform(0, 1, (xtest_bot.shape[0], self.z_dims))
        gen_examples = self._binarize(self.generator.predict([xtest_bot, noise]))
        test_TPR = self.blackbox_detector.score(gen_examples, ytest_bot)

        print('\n---TPR after the blackbox_detector is retrained (Before retraining MalGAN).')
        print('\nTrain_TPR: {0}, Test_TPR: {1}'.format(train_TPR, test_TPR))

In [None]:
# Random-forest black box: attack it, refit it on adversarial samples,
# then run a second attack round against the refitted detector.
botgan_RF = BotGAN(blackbox='RF')
botgan_RF.train(500, batch_size=4096)
botgan_RF.retrain_blackbox_detector()
botgan_RF.train(100, batch_size=4096, is_first=False)

In [None]:
# MLP black box: attack it, refit it on adversarial samples,
# then run a second attack round against the refitted detector.
botgan_MLP = BotGAN(blackbox='MLP')
botgan_MLP.train(500, batch_size=4096)
botgan_MLP.retrain_blackbox_detector()
botgan_MLP.train(100, batch_size=4096, is_first=False)

In [None]:
# Logistic-regression black box: attack it, refit it on adversarial samples,
# then run a second attack round against the refitted detector.
botgan_LR = BotGAN(blackbox='LR')
botgan_LR.train(500, batch_size=4096)
botgan_LR.retrain_blackbox_detector()
botgan_LR.train(100, batch_size=4096, is_first=False)

In [None]:
# SVM black box, run with fewer epochs than the other experiments:
# attack it, refit it on adversarial samples, then attack it again.
botgan_SVM = BotGAN(blackbox='SVM')
botgan_SVM.train(100, batch_size=4096)
botgan_SVM.retrain_blackbox_detector()
botgan_SVM.train(20, batch_size=4096, is_first=False)