<a href="https://colab.research.google.com/github/fle1scha/DL4NTP/blob/main/DL4NTP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Import tensorflow, numpy and pandas
import tensorflow as tf
import numpy as np 
import pandas as pd

In [3]:
#Import keras models
from random import random
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed

In [8]:
#Import any other libraries needed
import math
import os

1. Mount Google Drive file system

In [None]:

# Mount Google Drive at /content/gdrive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/gdrive')

KeyboardInterrupt: ignored

2. Read in SANReN sample data.

In [None]:
# Load the raw SANReN flow export. Each element of SANReN is one line of the file;
# element 0 is the header row and the remaining elements are data rows.
with open('/content/gdrive/My Drive/SANReN.txt') as f:
  SANReN = f.readlines()

# Uncomment to print the first 10 rows and confirm the data has been read correctly.
# (Kept as `#` comments: a bare triple-quoted string at the end of a cell is an
# expression, so the notebook would echo it back as the cell's output.)
# for i in range(10):
#   print(SANReN[i])

'for i in range(10):\n  print(SANReN[i])'

# ***Data Preprocessing***

3. Clean dataframe headers

In [None]:
# Build the DataFrame column labels from the header row (first line of the file).
header_tokens = SANReN[0].split()
# 'Src', 'IP', 'Addr:Port' collapse into one label; the same is done for 'Dst'.
src_heading = ''.join(header_tokens[4:7])
dst_heading = ''.join(header_tokens[7:10])
# Keep the four leading labels, the two merged address labels, and the five labels
# that follow 'Flags'/'Tos'; 'Flags', 'Tos' and the trailing 'Flows' are left out.
headings_line = header_tokens[0:4] + [src_heading, dst_heading] + header_tokens[12:17]


4. Clean time-series data points

In [None]:
# Turn every data row (all lines after the header) into a list of 11 string fields
# that line up with headings_line.
#
# Rows are split on whitespace, so a value written with a unit suffix (e.g. "1.5 M")
# arrives as two tokens and shifts every later token one position to the right.
# Each branch below therefore:
#   1. keeps tokens 0-4 and token 6, dropping tokens 5, 7 and 8
#      (presumably the '->' separator, Flags and Tos -- confirm against the raw file),
#   2. takes the metric tokens starting at index 9, and
#   3. re-joins any value/'M' token pair back into a single field such as "1.5M".
#
# NOTE(review): only a standalone 'M' token is recognised; other unit suffixes are not handled.
# NOTE(review): a line with fewer tokens than the indices tested (e.g. a blank line)
# raises IndexError here.
framedata = []
for i in range(1, len(SANReN)):
  data_line = SANReN[i].split()

  if (data_line[11] == "M" and data_line[14] == 'M'): #Both Bytes and bps carry an 'M' suffix token
    # 7 metric tokens are taken (two of them are 'M'); both pairs are merged below.
    data_line = data_line[0:5] + data_line[6:7] + data_line[9:16]
    data_line[7:9] = [''.join(data_line[7:9])]
    # After the first merge the list is one shorter, so the bps pair now sits at 9:11.
    data_line[9:11] = [''.join(data_line[9:11])]
  
  elif (data_line[13] == 'M'): #Only bps carries an 'M' suffix token
    # 6 metric tokens are taken; the bps value and its 'M' are merged.
    data_line = data_line[0:5] + data_line[6:7] + data_line[9:15]
    data_line[9:11] = [''.join(data_line[9:11])]
    
  elif data_line[11] == 'M': #Only Bytes carries an 'M' suffix token
    # 6 metric tokens are taken; the Bytes value and its 'M' are merged.
    data_line = data_line[0:5] + data_line[6:7] + data_line[9:15]
    data_line[7:9] = [''.join(data_line[7:9])]
  
  else: #No 'M' suffix tokens: the 5 metric tokens are used as-is
    data_line = data_line[0:5] + data_line[6:7] + data_line[9:14]
  
  framedata.append(data_line)
  

5. Convert numpy array into pandas dataframe and add additional columns.
  - Day: gives the day of the week as an integer. Monday is 0 and Sunday is 6.

In [None]:
# Assemble the cleaned rows into a DataFrame labelled with the cleaned headings.
flow_rows = np.array(framedata)
df = pd.DataFrame(data=flow_rows, columns=headings_line)
# Parse 'Date' into datetimes so the weekday can be derived from it.
df = df.assign(Date=pd.to_datetime(df['Date']))
# Day of the week as an integer: Monday is 0 and Sunday is 6.
df['Day'] = df['Date'].dt.dayofweek
print(df.head(10))

        Date    first-seen Duration Proto  ...   pps     bps   Bpp Day
0 2020-07-04  20:10:06.480    1.223   TCP  ...  3679    1.5M    52   5
1 2020-07-04  20:09:01.555   78.205   TCP  ...   396    4.8M  1500   5
2 2020-07-04  20:10:01.690    5.307   TCP  ...   188   60297    40   5
3 2020-07-04  20:09:23.019   43.982   TCP  ...    45  545677  1500   5
4 2020-07-04  20:10:07.007    0.000   TCP  ...     0       0  1500   5
5 2020-07-04  20:10:06.357    0.656   TCP  ...  3810    1.6M    52   5
6 2020-07-04  20:09:56.447   15.439   TCP  ...   550    6.6M  1500   5
7 2020-07-04  20:09:38.753   28.266   TCP  ...    70   22642    40   5
8 2020-07-04  20:08:53.973   73.971   TCP  ...   223   86520    48   5
9 2020-07-04  20:09:18.406   58.013   TCP  ...   474    5.4M  1422   5

[10 rows x 12 columns]


In [None]:
# Sanity check: show the final column labels, then the distinct source address:port values.
print(df.columns)
src_endpoints = df["SrcIPAddr:Port"].unique()
print(src_endpoints)



Index(['Date', 'first-seen', 'Duration', 'Proto', 'SrcIPAddr:Port',
       'DstIPAddr:Port', 'Packets', 'Bytes', 'pps', 'bps', 'Bpp', 'Day'],
      dtype='object')
['146.231.4.0:47837' '196.24.45.0:443' '146.230.0.0:6474'
 '197.102.66.0:443' '155.232.240.0:63576' '198.54.223.0:10285'
 '146.232.65.0:51128' '173.194.128.0:443' '155.232.240.0:443'
 '23.59.241.0:11891' '223.255.230.0:0' '196.24.45.0:42849'
 '198.54.223.0:11702' '196.24.45.0:80' '143.160.58.0:443'
 '196.21.118.0:33386' '196.23.168.0:443' '196.249.33.0:50266'
 '196.24.45.0:61855' '197.98.191.0:443' '192.96.15.0:51405'
 '196.21.109.0:17015' '146.231.24.0:59037' '196.21.242.0:56637'
 '196.21.157.0:33432' '155.232.7.0:41048' '196.24.45.0:0'
 '196.24.140.0:60101' '196.24.45.0:47435' '155.232.7.0:55518'
 '164.151.16.0:23018' '155.238.209.0:39010' '2.22.126.0:443'
 '23.54.159.0:443' '196.24.45.0:49233' '2.21.98.0:443' '130.211.203.0:443'
 '192.96.15.0:55340' '196.21.118.0:443' '209.204.232.0:80']


# ***LSTM Implementation***

6. Split the data into a training set and a test set using an 80/20 split.
   TODO: decide later whether to hold out a validation set or apply cross-validation.




In [10]:
def split_data_into_training_testing(data, split, shuffle=True):
    """Split ``data`` row-wise into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
        The samples to split; rows (first axis) are the unit of splitting.
    split : float
        Fraction of rows that go to the training set (e.g. 0.8 for 80/20).
    shuffle : bool, default True
        If True, rows are assigned at random using NumPy's global random state
        (call ``np.random.seed`` beforehand for a reproducible split). If False,
        the first ``int(len(data) * split)`` rows form the training set.

    Returns
    -------
    (training_set, test_set)
        Two objects of the same kind as ``data`` for pandas/NumPy input;
        other sequences are returned as NumPy arrays.
    """
    idx = np.arange(len(data))
    if shuffle:
        np.random.shuffle(idx)

    p = int(len(data) * split)
    train_idx, test_idx = idx[:p], idx[p:]

    # On pandas objects, data[int_array] is a *label* lookup (columns for a
    # DataFrame) and raises KeyError, so rows must be selected positionally.
    if hasattr(data, "iloc"):
        return data.iloc[train_idx], data.iloc[test_idx]

    # Plain Python sequences cannot be indexed with an index array; go through NumPy.
    data = np.asarray(data)
    return data[train_idx], data[test_idx]

In [None]:
# Keep both halves of the split: the cells below read training_set and test_set.
# shuffle is left at its default (True), so it does not need to be passed.
training_set, test_set = split_data_into_training_testing(df, 0.8)

7. Defining the Keras model configuration

In [11]:
# Sequential Keras model: one LSTM layer followed by a single-unit Dense output,
# compiled with mean-squared-error loss and the Adam optimizer.
model = Sequential()
model.add(LSTM( 128 , dropout = 0.25, recurrent_dropout = 0.25)) #128 memory units; 25% dropout on the inputs and on the recurrent state
model.add(Dense(1, activation='sigmoid')) #Dense output layer, sigmoid activation
# NOTE(review): a sigmoid output is bounded to (0, 1) -- presumably the target is meant
# to be scaled into that range before training; confirm.
model.compile(loss='mean_squared_error', optimizer='adam')
# NOTE(review): training_set must already be bound by an earlier cell, and it is passed
# here as both the inputs and the targets. Per the trailing comment, the intent is to
# fit on the features (excluding the target) and predict the target -- TODO split them.
# NOTE(review): an LSTM layer presumably needs numeric input shaped
# (samples, timesteps, features); confirm training_set is prepared that way.
model.fit(training_set, training_set, epochs=100, batch_size=1, verbose=2) #Idea will be to fit the features excluding target, and predict the target value

In [None]:
# 8. Model has been trained; now generate predictions

In [None]:
# Run the trained model over both partitions so train and test error can be compared.
training_predictions = model.predict(training_set)
# The held-out partition is named test_set everywhere else in the notebook
# (it was referenced here as the undefined name testing_set).
test_predictions = model.predict(test_set)


9. Calculate prediction metrics

In [None]:
# Mean squared error: the mean of the squared differences between actual and predicted values.
# np.mean(...) reduces the element-wise squared errors to a single scalar; without it the
# result is an array and the '%.2f' formatting below fails for more than one element.
# NOTE(review): the [0] / [:,0] indexing is kept from the original -- confirm that it selects
# the intended target values for the actual shape of training_set / test_set.
training_mse = np.mean(np.square(np.subtract(training_set[0], training_predictions[:,0])))
test_mse = np.mean(np.square(np.subtract(test_set[0], test_predictions[:,0])))
print('Train Score: %.2f MSE' % (training_mse))
# Root of the test MSE. This was previously computed from names that are never
# defined in the notebook (mean_squared_error, testY, testPredict).
testScore = math.sqrt(test_mse)
print('Test Score: %.2f MSE' % (test_mse))