In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

## Data preprocessing

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data_train=pd.read_csv('/content/drive/MyDrive/AML_assignment_2/UNSW_NB15_training-set.csv')
data_test=pd.read_csv('/content/drive/MyDrive/AML_assignment_2/UNSW_NB15_testing-set.csv')

In [4]:
data=pd.concat([data_train,data_test],ignore_index=True)
y=data['label']
data.drop('label',axis=1,inplace=True)

### Exploring metrics

In [5]:
unique_cat={}
cat_features=[]
for column in data.columns:
  if len(data[column].unique())<=12:
    unique_cat[column]=data[column].value_counts()/data.shape[0]
  if type(data[column][3])==str:
    cat_features.append(column)


In [6]:
for key in unique_cat:
  print(unique_cat[key])

FIN    0.454700
INT    0.451883
CON    0.078138
REQ    0.014875
RST    0.000326
ECO    0.000047
ACC    0.000016
CLO    0.000004
PAR    0.000004
URN    0.000004
no     0.000004
Name: state, dtype: float64
0      0.467410
252    0.310172
29     0.217807
60     0.004238
30     0.000167
31     0.000147
253    0.000035
32     0.000016
254    0.000008
Name: dttl, dtype: float64
2    0.450781
1    0.297233
0    0.224455
6    0.014491
3    0.012927
4    0.000109
5    0.000004
Name: ct_state_ttl, dtype: float64
0    0.987407
1    0.012493
4    0.000062
2    0.000039
Name: is_ftp_login, dtype: float64
0    0.987399
1    0.012477
2    0.000062
4    0.000062
Name: ct_ftp_cmd, dtype: float64
0     0.901348
1     0.091604
4     0.005138
9     0.000838
2     0.000357
16    0.000186
6     0.000163
12    0.000140
30    0.000116
25    0.000097
3     0.000012
Name: ct_flw_http_mthd, dtype: float64
0    0.985726
1    0.014274
Name: is_sm_ips_ports, dtype: float64
Normal            0.360923
Generic        

We can see that for 'is_sm_ips_ports', 'ct_ftp_cmd', 'is_ftp_login' more than 98% of column equals to specififc value. So we can remove these features. Also we exclude 'proto' because it contains more than 130 unique 

In [7]:
constant_columns=['is_sm_ips_ports', 'ct_ftp_cmd', 'is_ftp_login','proto']
data.drop(columns=constant_columns,inplace=True)


In [8]:
cat_features.remove('proto')

In [9]:
cat_features

['service', 'state', 'attack_cat']

### Fill missing values

In [10]:
from sklearn.impute import SimpleImputer
for column in data.columns[1:]: 
  imp_mean = SimpleImputer( strategy='most_frequent')
  imp_mean.fit(np.array(data[column]).reshape(-1,1))
  data[column]=imp_mean.transform(np.array(data[column]).reshape(-1,1))



In [11]:
# from sklearn.preprocessing import LabelEncoder
# for cat in cat_features:
#   encoder=LabelEncoder()

#   data[cat]=encoder.fit_transform(data[cat])

data=pd.get_dummies(data,columns=cat_features)


In [12]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
data = scaler.fit_transform(data)


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( data, y, test_size=0.2, random_state=42)

## Conditional GAN

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
device

device(type='cuda', index=0)

In [17]:
data_size=X_train.shape[1]
n_classes=2


In [18]:
from torch.utils.data import Dataset, DataLoader

class df(Dataset):
    def __init__(self, X, y, transform=None):
        self.transform = transform
        self.labels = y
        self.data=X

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.labels[idx]
        data=self.data[idx]
        
        if self.transform:
            data = self.transform(data)

        return data, label

In [19]:

## Undercomplete
class Generator(nn.Module):
    def __init__(self, data_size=data_size,n_classes=n_classes):
      super(Generator, self).__init__()

      self.label_emb = nn.Embedding(n_classes, n_classes)
      input_size=n_classes + data_size
      self.hidden_layer1 = nn.Sequential(
          nn.Linear(input_size, 128),
          nn.LeakyReLU(0.2)

      )

      self.hidden_layer2 = nn.Sequential(
          nn.Linear(128, 256),
          nn.LeakyReLU(0.2)

      )
      self.hidden_layer3 = nn.Sequential(
          nn.Linear(256, 512),
          nn.LeakyReLU(0.2),

      )
      self.hidden_layer4 = nn.Sequential(
          nn.Linear(512, data_size),
          nn.Tanh()

      )

        

        
    def forward(self, data,labels,n_classes=2):

      get_input = torch.cat((self.label_emb(labels.to(device)), data.to(device)), -1)
      x = get_input
      output=self.hidden_layer1(x)
      output=self.hidden_layer2(output)
      output=self.hidden_layer3(output)
      output=self.hidden_layer4(output)
      return output.to(device)
        
        
    
class Discriminator(nn.Module):
    def __init__(self, data_size=data_size,n_classes=n_classes):

      super(Discriminator, self).__init__()
      self.label_emb = nn.Embedding(n_classes, n_classes)
      # Step 1 : Define the encoder 
      # Step 2 : Define the decoder
      # Step 3 : Initialize the weights (optional)
      input_size = n_classes + data_size

      self.hidden_layer1 = nn.Sequential(
          nn.Linear(input_size, 512),
          nn.LeakyReLU(0.2),
          nn.Dropout(0.3)

      )

      self.hidden_layer2 = nn.Sequential(
          nn.Linear(512, 256),
          nn.LeakyReLU(0.2),
          nn.Dropout(0.3)

      )
      self.hidden_layer3 = nn.Sequential(
          nn.Linear(256, 128),
          nn.LeakyReLU(0.2),
          nn.Dropout(0.3)

      )
      self.hidden_layer4 = nn.Sequential(
          nn.Linear(128,1)

      )

        

        
    def forward(self, data,labels):
      # Step 1: Pass the input through encoder to get latent representation
      # Step 2: Take latent representation and pass through decoder
      get_input = torch.cat((self.label_emb(labels), data), -1)
      x = get_input
      output=self.hidden_layer1(x)
      output=self.hidden_layer2(output)
      output=self.hidden_layer3(output)
      output=self.hidden_layer4(output)
      return output.to(device)
        


In [20]:
batchSize=50
learning_rate = 0.0005
num_epochs = 20
generator=Generator()
discriminator=Discriminator()
if torch.cuda.is_available():
  generator.cuda()
  discriminator.cuda()

# print(summary(AE,input_size=(1, 64)))

criterion_gen = nn.KLDivLoss()
criterion=nn.MSELoss()
if torch.cuda.is_available():
  criterion.cuda()
  criterion_gen.cuda()
X_tensor_train = torch.tensor(X_train).float().to(device)
X_tensor_test = torch.tensor(X_test).float().to(device)
y_tensor_train = torch.tensor(y_train.values).float().to(device)
y_tensor_test = torch.tensor(y_test.values).float().to(device)



data_loader = DataLoader(df(X_tensor_train,y_tensor_train),batch_size=batchSize,shuffle=True)

gen_optimizer = torch.optim.Adam(generator.parameters(), lr=learning_rate)
dis_optimizer = torch.optim.Adam(discriminator.parameters(), lr=learning_rate)
#Create a random dataset
# del full_train

In [21]:
torch.cuda.is_available()

True

In [22]:
FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

In [31]:
from torch.autograd import Variable

In [23]:
losses=[]


from tqdm import tqdm

for epoch in tqdm(range(num_epochs)):
  gen_losses=[]
  dis_losses=[]

  for i, (x,y) in enumerate(data_loader):

      batch_size = x.shape[0]

      # Adversarial ground truths
      valid = Variable(FloatTensor(batch_size, 1).fill_(1.0), requires_grad=False)
      fake = Variable(FloatTensor(batch_size, 1).fill_(0.0), requires_grad=False)

      # Configure input
      real_data = Variable(x.type(FloatTensor))
      labels = Variable(y.type(LongTensor))

      # -----------------
      #  Train Generator
      # -----------------

      gen_optimizer.zero_grad()

      # Sample noise and labels as generator input
      z = Variable(FloatTensor(np.random.normal(0, 1, (batch_size, data.shape[1]))))
      gen_labels = Variable(LongTensor(np.random.randint(0, n_classes,batch_size)))

      # Generate new data
      gen_imgs = generator(z.to(device), gen_labels.to(device))

      # Loss measures generator's ability to fool the discriminator
      validity = discriminator(gen_imgs, gen_labels)
      g_loss = criterion_gen(validity, valid)
      gen_losses.append(g_loss.item())
      g_loss.backward()
      gen_optimizer.step()

      # ---------------------
      #  Train Discriminator
      # ---------------------

      dis_optimizer.zero_grad()

      # Loss for real data
      validity_real = discriminator(real_data, labels)
      d_real_loss = criterion(validity_real, valid)

      # Loss for fake data
      validity_fake = discriminator(gen_imgs.detach(), gen_labels)
      d_fake_loss = criterion(validity_fake, fake)

      # Total discriminator loss
      d_loss = (d_real_loss + d_fake_loss) / 2
      dis_losses.append(d_loss.item())
      d_loss.backward()
      dis_optimizer.step()
      # if i%0==0:
      #   print(
      #       "[Epoch %d/%d] [Batch %d/%d] [D loss: %f] [G loss: %f]"
      #       % (epoch, num_epochs, i, len(data_loader), d_loss.item(), g_loss.item())
      #   )

      # batches_done = epoch * len(data_loader) + i
  losses.append((np.mean(gen_losses),np.mean(dis_losses)))

        # if batches_done % opt.sample_interval == 0:
        #     sample_image(n_row=10, batches_done=batches_done)
  # log
  print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, d_loss.item()))

  "reduction: 'mean' divides the total loss by both the batch size and the support size."
  5%|▌         | 1/20 [00:28<09:00, 28.45s/it]

epoch [1/20], loss:1.4958


 10%|█         | 2/20 [01:09<10:50, 36.15s/it]

epoch [2/20], loss:0.5132


 15%|█▌        | 3/20 [01:41<09:37, 33.94s/it]

epoch [3/20], loss:38915.4023


 20%|██        | 4/20 [02:09<08:24, 31.55s/it]

epoch [4/20], loss:0.1258


 25%|██▌       | 5/20 [02:37<07:37, 30.48s/it]

epoch [5/20], loss:5453.4214


 30%|███       | 6/20 [03:04<06:50, 29.31s/it]

epoch [6/20], loss:0.1589


 35%|███▌      | 7/20 [03:38<06:40, 30.77s/it]

epoch [7/20], loss:0.0723


 40%|████      | 8/20 [04:07<06:03, 30.33s/it]

epoch [8/20], loss:0.6828


 45%|████▌     | 9/20 [04:36<05:28, 29.87s/it]

epoch [9/20], loss:1.6246


 50%|█████     | 10/20 [05:06<04:56, 29.69s/it]

epoch [10/20], loss:0.1755


 55%|█████▌    | 11/20 [05:38<04:35, 30.67s/it]

epoch [11/20], loss:0.0974


 60%|██████    | 12/20 [06:06<03:57, 29.68s/it]

epoch [12/20], loss:0.6522


 65%|██████▌   | 13/20 [06:36<03:27, 29.70s/it]

epoch [13/20], loss:0.2048


 70%|███████   | 14/20 [07:07<03:00, 30.13s/it]

epoch [14/20], loss:0.1388


 75%|███████▌  | 15/20 [07:36<02:28, 29.77s/it]

epoch [15/20], loss:0.2880


 80%|████████  | 16/20 [08:05<01:57, 29.48s/it]

epoch [16/20], loss:0.2404


 85%|████████▌ | 17/20 [08:32<01:26, 28.87s/it]

epoch [17/20], loss:2.7767


 90%|█████████ | 18/20 [09:07<01:01, 30.73s/it]

epoch [18/20], loss:6202.6152


 95%|█████████▌| 19/20 [09:34<00:29, 29.70s/it]

epoch [19/20], loss:24361.7559


100%|██████████| 20/20 [10:06<00:00, 30.31s/it]

epoch [20/20], loss:0.9651





In [24]:

torch.save(generator, '/content/drive/MyDrive/AML_assignment_2/gen.pth')
torch.save(discriminator, '/content/drive/MyDrive/AML_assignment_2/dis.pth')

In [23]:
gen=torch.load( '/content/drive/MyDrive/AML_assignment_2/gen.pth')
dis=torch.load( '/content/drive/MyDrive/AML_assignment_2/dis.pth')

### Discriminator loss

In [24]:
import matplotlib.pyplot as plt

temp=[loss[1] for loss in losses ]
plt.plot(range(len(temp)),temp)

### Generator loss

In [None]:

temp=[loss[0] for loss in losses ]
plt.plot(range(len(temp)),temp)

### Save losses

In [None]:
l={}
l['dis loss']=[loss[1] for loss in losses ]
l['gen loss']=[loss[0] for loss in losses ]


In [None]:
l_fd=pd.DataFrame(l)

In [None]:
l_fd.to_csv('/content/drive/MyDrive/AML_assignment_2/loss.csv')

After generating the network intrusion and balanced the data, it is important to see if balancing the data helped to improve a ML classifier perfomence. There are a lot of ML classifiers that can be used. For this assignment we will only take 3 into consideration. The 3 classifiers are Random Forest, Explainable Boosting Machine and Classical Neural Network.



### Balance data. Generate new observations with labels equal to 0.

In [26]:
disbalance=y_train.value_counts()[1]-y_train.value_counts()[0]

In [27]:
disbalance

57488

In [32]:
import random
#Generate to noise to input of generator

z = Variable(FloatTensor(np.random.normal(0, 1, (57000, data.shape[1]))))
gen_labels = Variable(LongTensor(np.zeros(57000)))

with torch.no_grad():
  new_data = gen(z, gen_labels).cpu().numpy()

In [33]:
X_train_ext = np.vstack((X_train, new_data))
y_train_ext = np.hstack((y_train, np.zeros(57000)))

In [34]:
y_train.value_counts()

1    131813
0     74325
Name: label, dtype: int64

In [35]:
np.unique(y_train_ext, return_counts=True)

(array([0., 1.]), array([131325, 131813]))

### Compare metrics of classification for extended dataset and dataset without generated data.

After generating the network intrusion and balanced the data, We will check if balancing the data helped to improve a ML classifier perfomence. For this assignment we will only take 3 into consideration. The 3 classifiers are Random Forest, Explainable Boosting Machine and Classical Neural Network.

In [None]:
#pip install interpret

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from interpret.glassbox import ExplainableBoostingClassifier

In [38]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [39]:
from sklearn.model_selection import cross_validate
def cross_validation(model, _X, _y, _cv=5):
      '''Function to perform 5 Folds Cross-Validation
       Parameters
       ----------
      model: Python Class, default=None
              This is the machine learning algorithm to be used for training.
      _X: array
           This is the matrix of features.
      _y: array
           This is the target variable.
      _cv: int, default=5
          Determines the number of folds for cross-validation.
       Returns
       -------
       The function returns a dictionary containing the metrics 'accuracy', 'precision',
       'recall', 'f1' for both training set and validation set.
      '''
      _scoring = ['accuracy', 'precision', 'recall', 'f1']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      
      return {"Training Accuracy scores": results['train_accuracy'],
              "Mean Training Accuracy": results['train_accuracy'].mean()*100,
              "Training Precision scores": results['train_precision'],
              "Mean Training Precision": results['train_precision'].mean(),
              "Training Recall scores": results['train_recall'],
              "Mean Training Recall": results['train_recall'].mean(),
              "Training F1 scores": results['train_f1'],
              "Mean Training F1 Score": results['train_f1'].mean(),
              "Validation Accuracy scores": results['test_accuracy'],
              "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
              "Validation Precision scores": results['test_precision'],
              "Mean Validation Precision": results['test_precision'].mean(),
              "Validation Recall scores": results['test_recall'],
              "Mean Validation Recall": results['test_recall'].mean(),
              "Validation F1 scores": results['test_f1'],
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

### Random forest classifier

 without extended data

In [40]:
print('Random forest')
rfc=RandomForestClassifier()
# rfc.fit(X_train,y_train)
# y_pred = rfc.predict(X_test)
# print(classification_report(y_test, y_pred))
rfc_res=cross_validation(rfc, X_train, y_train, _cv=5)

Random forest


In [41]:
rfc_res

{'Training Accuracy scores': array([1., 1., 1., 1., 1.]),
 'Mean Training Accuracy': 100.0,
 'Training Precision scores': array([1., 1., 1., 1., 1.]),
 'Mean Training Precision': 1.0,
 'Training Recall scores': array([1., 1., 1., 1., 1.]),
 'Mean Training Recall': 1.0,
 'Training F1 scores': array([1., 1., 1., 1., 1.]),
 'Mean Training F1 Score': 1.0,
 'Validation Accuracy scores': array([1., 1., 1., 1., 1.]),
 'Mean Validation Accuracy': 100.0,
 'Validation Precision scores': array([1., 1., 1., 1., 1.]),
 'Mean Validation Precision': 1.0,
 'Validation Recall scores': array([1., 1., 1., 1., 1.]),
 'Mean Validation Recall': 1.0,
 'Validation F1 scores': array([1., 1., 1., 1., 1.]),
 'Mean Validation F1 Score': 1.0}

Add extended data

In [42]:
print('Random forest')
rfc=RandomForestClassifier()
rfc.fit(X_train_ext,y_train_ext)
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

Random forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18675
           1       1.00      1.00      1.00     32860

    accuracy                           1.00     51535
   macro avg       1.00      1.00      1.00     51535
weighted avg       1.00      1.00      1.00     51535



### MLPClassifier

Without synthetic data

In [43]:
mpl = MLPClassifier(hidden_layer_sizes=(50))
print(cross_validation(mpl, X_train, y_train, _cv=5))

{'Training Accuracy scores': array([0.99725305, 0.99839306, 0.99710145, 0.99773211, 0.99853861]), 'Mean Training Accuracy': 99.78036549988288, 'Training Precision scores': array([0.99636931, 0.99823792, 0.99965729, 0.99663481, 0.99955367]), 'Mean Training Precision': 0.9980905997230469, 'Training Recall scores': array([0.99934566, 0.99925083, 0.99580844, 0.9998293 , 0.99816028]), 'Mean Training Recall': 0.9984789037655997, 'Training F1 scores': array([0.99785527, 0.99874412, 0.99772915, 0.9982295 , 0.99885649]), 'Mean Training F1 Score': 0.9982829058006756, 'Validation Accuracy scores': array([0.99755021, 0.99805957, 0.99755021, 0.99793824, 0.99837485]), 'Mean Validation Accuracy': 99.78946168288336, 'Validation Precision scores': array([0.99685939, 0.99784066, 0.99977164, 0.99701154, 0.99939226]), 'Mean Validation Precision': 0.99817509785162, 'Validation Recall scores': array([0.99931722, 0.99912757, 0.99639646, 0.9997724 , 0.9980654 ]), 'Mean Validation Recall': 0.9985358103235354, 

With extended data

In [44]:
mpl = MLPClassifier(hidden_layer_sizes=(50))
print(cross_validation(mpl, X_train_ext, y_train_ext, _cv=5))

{'Training Accuracy scores': array([0.99859864, 0.99858914, 0.99845613, 0.99899768, 0.99922094]), 'Mean Training Accuracy': 99.87725065615692, 'Training Precision scores': array([0.99723848, 0.99731368, 0.99706842, 0.99980054, 0.99863604]), 'Mean Training Precision': 0.9980114326621112, 'Training Recall scores': array([0.99997155, 0.99987672, 0.99985775, 0.99819822, 0.99981034]), 'Mean Training Recall': 0.9995429151094287, 'Training F1 scores': array([0.99860315, 0.99859356, 0.99846114, 0.99899873, 0.99922284]), 'Mean Training F1 Score': 0.9987758838415907, 'Validation Accuracy scores': array([0.9970548 , 0.99692179, 0.99726381, 0.99868889, 0.99982899]), 'Mean Validation Accuracy': 99.79516550804554, 'Validation Precision scores': array([0.99419219, 0.99422794, 0.99479147, 0.99980991, 0.99992412]), 'Mean Validation Precision': 0.9965891244346503, 'Validation Recall scores': array([0.99996207, 0.99965861, 0.99977241, 0.99757226, 0.99973447]), 'Mean Validation Recall': 0.9993399636363796

ExplainableBoostingClassifier

In [45]:
ebc = ExplainableBoostingClassifier()
cross_validation(mpl, X_train, y_train, _cv=5)

{'Training Accuracy scores': array([0.99763507, 0.99885998, 0.99617367, 0.9980535 , 0.99670125]),
 'Mean Training Accuracy': 99.74846949793867,
 'Training Precision scores': array([0.99644653, 0.99981956, 0.99986647, 0.99734133, 0.999838  ]),
 'Mean Training Precision': 0.9986623797364131,
 'Training Recall scores': array([0.99986724, 0.99839734, 0.99414889, 0.99962068, 0.99500242]),
 'Mean Training Recall': 0.9974073122368988,
 'Training F1 scores': array([0.99815395, 0.99910795, 0.99699948, 0.9984797 , 0.99741435]),
 'Mean Training F1 Score': 0.99803108683872,
 'Validation Accuracy scores': array([0.99772   , 0.99878723, 0.99626467, 0.99808378, 0.99694375]),
 'Mean Validation Accuracy': 99.75598866184656,
 'Validation Precision scores': array([0.99674748, 0.99984803, 0.99969495, 0.99742609, 0.99977141]),
 'Mean Validation Precision': 0.9986975918363565,
 'Validation Recall scores': array([0.99969654, 0.99825513, 0.99446194, 0.99958273, 0.99544799]),
 'Mean Validation Recall': 0.99748

In [46]:
ebc = ExplainableBoostingClassifier()
print(cross_validation(mpl, X_train_ext, y_train_ext, _cv=5))

{'Training Accuracy scores': array([0.99856064, 0.99636122, 0.99857964, 0.99790035, 0.99796685]), 'Mean Training Accuracy': 99.7873739140944, 'Training Precision scores': array([0.99721006, 0.99941797, 0.99718195, 0.99582598, 0.99598576]), 'Mean Training Precision': 0.9971243432244277, 'Training Recall scores': array([0.99992413, 0.99331437, 0.99999052, 1.        , 0.99997155]), 'Mean Training Recall': 0.9986401138519663, 'Training F1 scores': array([0.99856525, 0.99635682, 0.99858426, 0.99790862, 0.99797467]), 'Mean Training F1 Score': 0.9978779263635541, 'Validation Accuracy scores': array([0.99692179, 0.99631375, 0.99739682, 0.99988599, 0.999981  ]), 'Mean Validation Accuracy': 99.80998702857354, 'Validation Precision scores': array([0.9940043 , 0.99881819, 0.99490489, 0.99984828, 1.        ]), 'Mean Validation Precision': 0.9975151320558165, 'Validation Recall scores': array([0.9998862 , 0.99381709, 0.99992414, 0.99992413, 0.99996207]), 'Mean Validation Recall': 0.9987027264438785,

In [None]:
ebс_global = ebс.explain_global(name='EBM')
show(ebс_global)