## Load and preprocess dataset

In [1]:
import pandas as pd
# Load the packet records from CSV. NOTE(review): the absolute path looks like
# a Colab-mounted Google Drive folder - confirm it exists before running elsewhere.
dataset = pd.read_csv('/content/drive/MyDrive/anomaly_dataset/dataset.csv')
# Bare last expression so the notebook renders a preview of the frame.
dataset

Unnamed: 0,version,ip_length,ttl,protocol,source_ip,destination_ip,source_port,destination_port,seq_num,ack_num,tcp_length,ns,cwr,ece,urg,ack,psh,rst,syn,fin,data_size,is_anomaly
0,4,5,64,6,127.0.0.1,127.0.0.1,3000,45572,2385532062,4030603442,8,0,0,0,0,1,1,0,0,0,3,normal
1,4,5,64,6,127.0.0.1,127.0.0.1,45572,3000,4030603442,2385532065,8,0,0,0,0,1,0,0,0,0,0,normal
2,4,5,64,6,127.0.0.1,127.0.0.1,45572,3000,4030603442,2385532065,8,0,0,0,0,1,1,0,0,0,7,normal
3,4,5,64,6,127.0.0.1,127.0.0.1,3000,45572,2385532065,4030603449,8,0,0,0,0,1,0,0,0,0,0,normal
4,4,5,64,6,192.168.11.24,192.168.11.21,44172,3000,3767039168,3365735764,8,0,0,0,0,1,1,0,0,0,7,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5504,4,5,64,6,127.0.0.1,127.0.0.1,3000,45572,2385578962,4030605052,8,0,0,0,0,1,0,0,0,0,0,normal
5505,4,5,64,6,192.168.11.24,192.168.11.21,44172,3000,3767052068,3365782661,8,0,0,0,0,1,1,0,0,0,7,normal
5506,4,5,64,6,192.168.11.23,192.168.11.21,47070,3000,4046117386,3222510617,8,0,0,0,0,1,1,0,0,0,7,normal
5507,4,5,64,6,192.168.11.22,192.168.11.21,58688,3000,735286812,2048439313,8,0,0,0,0,1,1,0,0,0,7,normal


## Feature engineering and normalization

In [2]:
def is_local(row):
   """Flag loopback traffic for a single packet row.

   Returns 1 when either the destination or the source address of ``row``
   equals '127.0.0.1', and -1 otherwise.
   """
   # Guard clause: neither endpoint is the loopback address.
   if row['destination_ip'] != '127.0.0.1' and row['source_ip'] != '127.0.0.1':
      return -1
   return 1

# Row-wise apply (axis=1): store the +1/-1 loopback flag as a new column.
dataset['is_local'] = dataset.apply(is_local, axis=1)

In [3]:
def is_src_3000(row):
   """Return 1 when the row's source port is 3000, otherwise -1."""
   return 1 if row['source_port'] == 3000 else -1

# Row-wise apply (axis=1): +1 where the packet was sent from port 3000, -1 elsewhere.
dataset['is_src_3000'] = dataset.apply(is_src_3000, axis=1)

In [4]:
def is_dst_3000(row):
   """Return 1 when the row's destination port is 3000, otherwise -1."""
   targets_port_3000 = row['destination_port'] == 3000
   if not targets_port_3000:
      return -1
   return 1

# Row-wise apply (axis=1): +1 where the packet is addressed to port 3000, -1 elsewhere.
dataset['is_dst_3000'] = dataset.apply(is_dst_3000, axis=1)

In [5]:
def is_ack_0(row):
   """Return 1 when the row's acknowledgement number is zero, otherwise -1."""
   return 1 if row['ack_num'] == 0 else -1

# Row-wise apply (axis=1): +1 where ack_num is zero, -1 elsewhere.
dataset['is_ack_0'] = dataset.apply(is_ack_0, axis=1)

In [6]:
# Drop the raw address/port/sequence columns. The IP addresses are non-numeric,
# and port/seq/ack are categorical with too many categories and too little data
# per category. Only the flags derived from them in the cells above remain.
# inplace=True mutates `dataset`, so this cell fails with KeyError if run twice.
dataset.drop(['source_ip', 'destination_ip', 'source_port', 'destination_port', 'seq_num', 'ack_num'], axis=1, inplace=True)

In [7]:
# Count distinct values per column; columns with a single value carry no information.
dataset.nunique()

version         1
ip_length       1
ttl            24
protocol        1
tcp_length      3
ns              1
cwr             1
ece             1
urg             2
ack             2
psh             2
rst             1
syn             2
fin             2
data_size      21
is_anomaly      2
is_local        2
is_src_3000     2
is_dst_3000     2
is_ack_0        2
dtype: int64

In [8]:
# Drop the columns that hold only one distinct value (see the nunique() output).
# inplace=True mutates `dataset`, so this cell fails with KeyError if run twice.
dataset.drop(['version', 'ip_length', 'protocol', 'ns', 'cwr', 'ece', 'rst'], axis=1, inplace=True)

In [9]:
# Summary statistics of the remaining numeric columns before normalization;
# percentiles=[] limits the percentile rows to the median (50%).
dataset.describe(percentiles=[])

Unnamed: 0,ttl,tcp_length,urg,ack,psh,syn,fin,data_size,is_local,is_src_3000,is_dst_3000,is_ack_0
count,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0
mean,56.91414,6.773643,0.131421,0.501361,0.472137,0.053912,0.255763,68.942821,-0.617353,-0.798148,-0.191142,-0.002723
std,9.033006,1.647644,0.337891,0.500044,0.499268,0.225864,0.436329,211.857119,0.786757,0.602515,0.981652,1.000087
min,37.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
50%,64.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
max,64.0,10.0,1.0,1.0,1.0,1.0,1.0,845.0,1.0,1.0,1.0,1.0


In [10]:
# normalize data: rescale every remaining feature so that it lies within [-1, 1]
# NOTE: each line overwrites its column, so running this cell twice rescales twice.

# ttl: the largest value in this dataset is 64, which maps to exactly 1.0
dataset['ttl'] = dataset['ttl'] / 32.0 - 1
# length of tcp header can be in 5..15 range -> centre on 10, half-width 5
dataset['tcp_length'] = (dataset['tcp_length'] - 10.0) / 5.0
# binary TCP flags: map {0, 1} onto {-1, +1}
for flag in ['urg', 'ack', 'psh', 'syn', 'fin']:
    dataset[flag] = dataset[flag] * 2 - 1
# data_size: 845 is the largest value in this dataset, so 0..845 maps onto -1..1
dataset['data_size'] = dataset['data_size'] / 845.0 * 2.0 - 1.0

In [11]:
# pop() removes the label column and the assignment re-adds it, which moves
# `is_anomaly` to the last position of the frame.
dataset['is_anomaly'] = dataset.pop('is_anomaly')

def is_anomaly(row):
   """Encode the ``is_anomaly`` label of a row as 1 (anomaly) or 0 (anything else).

   The raw label is the string 'anomaly' for anomalous packets. An already
   encoded numeric label is passed through (1 stays 1, everything else is 0),
   so re-running the cell that applies this function does not silently turn
   every label into 0.
   """
   label = row['is_anomaly']
   if isinstance(label, str):
      return 1 if label == 'anomaly' else 0
   # Label was encoded by a previous run of the cell: keep anomalies as 1.
   return 1 if label == 1 else 0

# Row-wise apply (axis=1): replace the string label with its 0/1 encoding.
dataset['is_anomaly'] = dataset.apply(is_anomaly, axis=1)

In [12]:
# Preview the fully preprocessed frame (12 feature columns followed by the label).
dataset

Unnamed: 0,ttl,tcp_length,urg,ack,psh,syn,fin,data_size,is_local,is_src_3000,is_dst_3000,is_ack_0,is_anomaly
0,1.0,-0.4,-1,1,1,-1,-1,-0.992899,1,1,-1,-1,0
1,1.0,-0.4,-1,1,-1,-1,-1,-1.000000,1,-1,1,-1,0
2,1.0,-0.4,-1,1,1,-1,-1,-0.983432,1,-1,1,-1,0
3,1.0,-0.4,-1,1,-1,-1,-1,-1.000000,1,1,-1,-1,0
4,1.0,-0.4,-1,1,1,-1,-1,-0.983432,-1,-1,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5504,1.0,-0.4,-1,1,-1,-1,-1,-1.000000,1,1,-1,-1,0
5505,1.0,-0.4,-1,1,1,-1,-1,-0.983432,-1,-1,1,-1,0
5506,1.0,-0.4,-1,1,1,-1,-1,-0.983432,-1,-1,1,-1,0
5507,1.0,-0.4,-1,1,1,-1,-1,-0.983432,-1,-1,1,-1,0


In [13]:
# Summary statistics after normalization: check min/max to confirm each feature stays within [-1, 1].
dataset.describe(percentiles=[])

Unnamed: 0,ttl,tcp_length,urg,ack,psh,syn,fin,data_size,is_local,is_src_3000,is_dst_3000,is_ack_0,is_anomaly
count,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0,5509.0
mean,0.778567,-0.645271,-0.737157,0.002723,-0.055727,-0.892176,-0.488473,-0.836822,-0.617353,-0.798148,-0.191142,-0.002723,0.496642
std,0.282281,0.329529,0.675782,1.000087,0.998537,0.451728,0.872658,0.501437,0.786757,0.602515,0.981652,1.000087,0.500034
min,0.15625,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,1.0,-0.4,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split (and every metric computed from it) is the same on each Restart & Run All.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

def separate_xy(data):
    """Split a frame into its feature columns and its ``is_anomaly`` label.

    Parameters:
        data: DataFrame that contains an ``is_anomaly`` column.

    Returns:
        (x, y): ``x`` is a new DataFrame with every column except
        ``is_anomaly`` (original order kept); ``y`` is the ``is_anomaly``
        Series. ``data`` itself is not modified.

    Raises:
        KeyError: if ``data`` has no ``is_anomaly`` column.
    """
    # Select by name rather than by position: a fixed 0:12 slice would leak the
    # label into the features (or drop features) if the column count changed.
    return data.drop(columns=['is_anomaly']), data['is_anomaly']

# Split both partitions into a feature frame and a label series.
train_x, train_y = separate_xy(train)
test_x, test_y = separate_xy(test)

# Show how many training rows fall into each class.
train_y.value_counts()

0    2215
1    2192
Name: is_anomaly, dtype: int64

## Model

In [15]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

In [16]:
def create_model():
  """Build and compile the binary packet classifier.

  Architecture: 12 inputs -> three Dense(32, relu) layers -> Dense(1, sigmoid).
  The model is compiled with binary cross-entropy loss and an accuracy metric;
  no optimizer is passed, so Keras' default is used.

  Returns:
    The compiled Sequential model.
  """
  layers = [
      Dense(32, activation='relu', input_shape=(12,)),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(loss='binary_crossentropy', metrics=['accuracy'])
  return network

In [17]:
def get_weights(model):
  """Collect the weights of every layer of ``model``.

  Returns:
    A list with one entry per layer, in layer order; each entry is whatever
    that layer's ``get_weights()`` returns.
  """
  return [layer.get_weights() for layer in model.layers]

In [18]:
def set_weights(model, weights):
  """Load per-layer weights into ``model``.

  ``weights[k]`` is passed to ``set_weights`` of the k-th layer. Layers beyond
  ``len(weights)`` are left untouched.
  """
  for index, layer_weights in enumerate(weights):
    model.layers[index].set_weights(layer_weights)

In [19]:
def randomize_ndarray(a, limit, p):
  """Return a copy of ``a`` with uniform noise added to some of its elements.

  Parameters:
    a: NumPy array to perturb (not modified).
    limit: noise is drawn uniformly from [-limit, limit).
    p: when ``p < 1``, an element keeps its original value whenever an
      independent uniform draw from [0, 1) exceeds ``p``; otherwise every
      element receives noise.

  Returns:
    A new array of the same shape as ``a``.
  """
  noise = np.random.uniform(-limit, limit, a.shape)
  if not p < 1:
    return a + noise

  # Elements whose draw exceeds p are left unchanged (their noise becomes 0).
  untouched = np.random.uniform(0, 1, a.shape) > p
  return a + np.where(untouched, 0.0, noise)
  

def randomize_weights(weights, p):
  """Create a mutated copy of a full set of layer weights.

  ``weights`` is a list of ``[kernel, bias]`` pairs, one per layer. Every layer
  except the last gets noise limits of 0.4 (kernel) and 0.1 (bias); the last
  layer uses 0.5 and 0.03. ``p`` is forwarded to ``randomize_ndarray``.

  Returns:
    A new list of ``[kernel, bias]`` pairs; the input is not modified.
  """
  mutated = [
      [randomize_ndarray(kernel, 0.4, p), randomize_ndarray(bias, 0.1, p)]
      for kernel, bias in weights[:-1]
  ]

  # last layer is modified with different coefficients
  out_kernel, out_bias = weights[-1]
  mutated.append([
      randomize_ndarray(out_kernel, 0.5, p),
      randomize_ndarray(out_bias, 0.03, p),
  ])
  return mutated

In [20]:
def spawn_childs(population, n_childs, p):
  """Extend ``population`` in place with mutated children.

  For every member present at call time, ``n_childs`` mutated copies (made by
  ``randomize_weights`` with probability ``p``) are appended after the existing
  members, grouped parent by parent. Returns None.
  """
  # Snapshot the parents so the children appended below are not mutated again.
  parents = list(population)
  population.extend(
      randomize_weights(parent, p)
      for parent in parents
      for _ in range(n_childs)
  )

In [21]:
def get_loss(weights, model):
  """Return the training loss obtained with ``weights``.

  Side effect: ``weights`` are loaded into ``model`` and stay there. The model
  is evaluated on the notebook-level ``train_x`` / ``train_y``; the accuracy
  that ``evaluate`` also returns is discarded.
  """
  set_weights(model, weights)
  loss, _accuracy = model.evaluate(train_x, train_y, verbose=0)
  return loss

def choose_best(population, model, n_best):
  """Select the ``n_best`` candidates with the lowest training loss.

  ``population`` is sorted in place (ascending loss, evaluated through
  ``get_loss`` with ``model``); a new list holding the first ``n_best``
  entries is returned.
  """
  def training_loss(candidate):
    return get_loss(candidate, model)

  population.sort(key=training_loss)
  survivors = population[:n_best]
  return survivors

In [22]:
# Build the network once; its freshly initialised weights are the template
# from which the initial population is derived.
model = create_model()
weights = get_weights(model)

In [23]:
# Settings of the evolutionary search.
n_best = 10      # candidates kept after each selection step
n_childs = 9     # mutated children spawned from every kept candidate
iterations = 15  # number of selection rounds
p = 0.2          # forwarded to randomize_weights when spawning children

# Initial population: n_best * (n_childs + 1) copies of the template weights,
# each perturbed with p=1 (every element receives noise).
population = list(randomize_weights(weights, 1) for i in range(n_best * (n_childs + 1)))

for i in range(iterations):
  # Keep the n_best candidates with the lowest training loss.
  population = choose_best(population, model, n_best)

  # choose_best leaves the weights of the last evaluated candidate in the model,
  # so load the best candidate explicitly before reporting metrics.
  set_weights(model, population[0])
  train_loss, train_acc = model.evaluate(train_x, train_y, verbose=0)
  test_loss, test_acc = model.evaluate(test_x, test_y, verbose=0)
  print(f'Iteration {i}, train loss {train_loss:.3}, train acc {train_acc:}, test loss {test_loss:.3}, test acc {test_acc:}')

  # No children after the final round: the model keeps the best weights found.
  if i != iterations - 1:
    spawn_childs(population, n_childs, p)


Iteration 0, train loss 0.348, train acc 0.9961425065994263, test loss 0.346, test acc 0.9972776770591736
Iteration 1, train loss 0.217, train acc 0.9975039958953857, test loss 0.22, test acc 0.9972776770591736
Iteration 2, train loss 0.0852, train acc 0.9546176791191101, test loss 0.0831, test acc 0.9573502540588379
Iteration 3, train loss 0.0342, train acc 0.9979577660560608, test loss 0.0338, test acc 0.9981850981712341
Iteration 4, train loss 0.00925, train acc 0.9979577660560608, test loss 0.00858, test acc 0.9981850981712341
Iteration 5, train loss 0.00422, train acc 0.9979577660560608, test loss 0.00355, test acc 0.9981850981712341
Iteration 6, train loss 0.00288, train acc 0.9995461702346802, test loss 0.00283, test acc 1.0
Iteration 7, train loss 0.00195, train acc 1.0, test loss 0.00186, test acc 1.0
Iteration 8, train loss 0.000899, train acc 1.0, test loss 0.000874, test acc 1.0
Iteration 9, train loss 0.000271, train acc 1.0, test loss 0.00023, test acc 1.0
Iteration 10, t

In [24]:
# Persist the model (holding the best weights found by the loop above) to an HDF5 file.
model.save('trained_model_genetic.h5')

## Check

In [25]:
from keras.models import load_model
# Reload the model from disk so the checks below exercise the saved artifact.
loaded_model = load_model('trained_model_genetic.h5')

In [26]:
import numpy as np

def check(x):
  """Classify one feature vector with ``loaded_model`` and print the verdict.

  ``x`` is passed straight to the notebook-level ``loaded_model``; the first
  element of the first output row is compared against 0.5. Prints
  'anomaly detected' above the threshold, 'packet is normal' otherwise.
  """
  score = loaded_model(x)[0][0]
  verdict = 'anomaly detected' if score > 0.5 else 'packet is normal'
  print(verdict)

# Hand-written, already-normalized feature vectors. Column order follows the
# training frame: ttl, tcp_length, urg, ack, psh, syn, fin, data_size,
# is_local, is_src_3000, is_dst_3000, is_ack_0.
x1 = np.array([[1,-0.4,-1,1,1,-1,-1,-0.98,-1,-1,1,-1]])  # normal
x2 = np.array([[0.375,-1,1,-1,1,-1,1,-1,-1,-1,-1,1]])    # anomaly

check(x1)
check(x2)

packet is normal
anomaly detected


In [27]:
# Hand-written vector labelled by the author as an SMB scan. In the training
# column order it has only the SYN flag set, minimum data_size, ack_num == 0,
# and is neither loopback traffic nor on port 3000.
x = np.array([[1.0, 0.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0]])  # smb scan
check(x)

anomaly detected
