# Handling Networking Data

This notebook shows how to use IntelligentElement to handle nested networking data. 

The example uses captures obtained with Cisco Joy - https://github.com/cisco/joy
You can either use Joy to capture network traffic directly or use it to extract information from existing PCAP files.

## Obtain data from file

The first step is to read the Joy file, a gzip-compressed list of dictionaries (one JSON object per line).

In [1]:
import json
import gzip
import struct

def GzipFileSize(filename):
    '''
    Auxiliary function that returns the size of a Gzip file.
    filename - Path to the Gzip file
    returns - Uncompressed size of Gzip file in bytes, as stored in the
              4-byte little-endian trailer of the file. The gzip format
              stores this value modulo 2**32, so it is only exact for
              payloads smaller than 4 GiB.
    raises - OSError if the file cannot be opened or is shorter than 4 bytes
    ''' 
    # The context manager closes the handle even if seek/read raises.
    with open(filename, 'rb') as fo:
        fo.seek(-4, 2)  # whence=2: offset is relative to the end of the file
        r = fo.read(4)
    return struct.unpack('<I', r)[0]
def LoadJoyJson(filename):
    '''
    Loads a .gzip file extracted from PCAP using CISCO's Joy tool.
    filename - Path to the Gzip file
    returns - List of dictionaries. Each entry corresponds to a flow capture in Joy file.

    The first line of the file is treated as a header and is skipped; an
    empty file yields an empty list. Lines that cannot be parsed as JSON are
    reported on stdout and skipped, so one corrupt line does not abort the load.
    '''
    fileData = []

    with gzip.open(filename) as infile: #we do not want to load the entire file into memory
        iterFile = iter(infile)
        # Discard the header line. The None default keeps an empty file from
        # raising StopIteration.
        next(iterFile, None)
        for bline in iterFile:
            line = bline.decode('utf-8')
            try:
                # All backslashes are stripped before parsing.
                # NOTE(review): this also removes legitimate JSON escapes such
                # as \" or \n inside string values - confirm this is intended.
                fileData.append(json.loads(line.replace('\\','')))
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; catching only
                # this keeps KeyboardInterrupt and real bugs from being hidden.
                print('Problem loading JSON line')

    return fileData

In [2]:
# Capture files to load; the traffic category is encoded in the directory name.
files = ['PCData/news/newsFolha.gz', 'PCData/socialmedia/logFacebook.gz', 'PCData/videostreaming/logYoutube.gz', 'PCData/videostreaming/logYoutube3.gz']
net_data = []  # one dictionary per flow, concatenated over all files
labels = []    # one integer class id per flow, aligned with net_data
for f in files:
    # Resolve the class id before loading, so a path that matches no category
    # fails loudly instead of adding flows without labels (which would leave
    # net_data and labels with different lengths).
    if 'news' in f:
        label = 0
    elif 'socialmedia' in f:
        label = 1
    elif 'videostreaming' in f:
        label = 2
    else:
        raise ValueError('Cannot infer a label from path: ' + f)

    data = LoadJoyJson(f)
    net_data += data
    labels += [label]*len(data)

In [3]:
# Inspect the first flow record to see which fields a Joy capture provides
net_data[0]

{'sa': '10.6.11.66',
 'da': '200.147.4.47',
 'pr': 6,
 'sp': 51697,
 'dp': 443,
 'bytes_out': 865,
 'num_pkts_out': 55,
 'bytes_in': 182540,
 'num_pkts_in': 126,
 'time_start': 1531230417.756095,
 'time_end': 1531230419.879366,
 'packets': [{'b': 441, 'dir': '>', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 257},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 249},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 2},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir

# Build Model

We will showcase a simple model that will handle:

- bytes_out, bytes_in, num_pkts_out, num_pkts_in, time_end-time_start in the first level;
- packets as a nested list of dictionaries
- ip with nested out and in, with ttl as a number and id as list

In [35]:
from keras import Model
from keras import layers as L
import numpy as np

# Add the directory two levels above this notebook's location to the import path,
# presumably so that the `IntelligentElement` import below resolves without installing the package.
import os,sys,inspect
# Directory of the file backing the current frame
# NOTE(review): inside a notebook this most likely resolves relative to the working directory - confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Go up two directory levels from currentdir
parentdir = os.path.dirname(os.path.dirname(currentdir))
# Position 0 gives this directory priority over other sys.path entries
sys.path.insert(0,parentdir) 

%load_ext autoreload
%autoreload 2

import IntelligentElement as IE

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## IP

We handle 'in' and 'out' separately.

In [36]:
def get_ip_out_info(x):
    '''
    Extracts the outbound IP TTL of a flow.
    x - Flow dictionary (one entry of net_data)
    returns - Integer numpy array of shape (1,) holding x['ip']['out']['ttl'],
              or 0 when the 'ip', 'out' or 'ttl' entry is missing
    '''
    # Chained .get() calls fall back to 0 at any missing level instead of
    # raising KeyError; the flow dictionary itself is never modified.
    ttl = x.get('ip', {}).get('out', {}).get('ttl', 0)

    return np.array([ttl]).astype(int)

# One single-element integer array (the outbound TTL) per flow
ip_out_data = [get_ip_out_info(x) for x in net_data]
ip_out_shape = (1,)

# Treat the TTL as a categorical index: a 256-entry embedding table with
# 8-dimensional vectors, flattened from (1, 8) to (8,) per sample.
inp=L.Input(ip_out_shape)
x  =L.Embedding(256, 8)(inp)
x  =L.Flatten()(x)
ip_out_model = Model(inputs=inp, outputs=x)

print('Original model')
ip_out_model.summary()

# Bundle the data, the model that processes it and the per-sample input shape
ip_out_ie = IE.IntelligentElement(ip_out_data, ip_out_model, ip_out_shape, name='ip_out_ie')

# Returns a (model, inputs, outputs) triple, judging by how it is unpacked here;
# only the model is used, to print its summary.
m, ii, oo = ip_out_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1, 8)              2048      
_________________________________________________________________
flatten_2 (Flatten)          (None, 8)                 0         
Total params: 2,048
Trainable params: 2,048
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_ip_out_ie (InputLayer)   (None, 1)                 0         
_________________________________________________________________
m_ip_out_ie (Model)          (None, 8)                 2048      
Total params: 2,048
Trainable params: 2,0

In [37]:
def get_ip_in_info(x):
    '''
    Extracts the inbound IP TTL of a flow.
    x - Flow dictionary (one entry of net_data); it is not modified
    returns - Integer numpy array of shape (1,) holding x['ip']['in']['ttl'],
              or 0 when the 'ip', 'in' or 'ttl' entry is missing
    '''
    # Read-only lookups: the previous version inserted a default 'in' entry
    # into x['ip'], silently altering the caller's flow record.
    ttl = x.get('ip', {}).get('in', {}).get('ttl', 0)

    return np.array([ttl]).astype(int)

# One single-element integer array (the inbound TTL) per flow
ip_in_data = [get_ip_in_info(x) for x in net_data]

# Reuse the model object built for the outbound TTL (ip_out_model) instead of
# creating a second one, so both directions go through the same embedding.
print('Original model')
ip_out_model.summary()

# Same model and input shape as ip_out_ie, different data and name
ip_in_ie = IE.IntelligentElement(ip_in_data, ip_out_model, ip_out_shape, name='ip_in_ie')

# Only the returned model is used, to print its summary
m, ii, oo = ip_in_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1, 8)              2048      
_________________________________________________________________
flatten_2 (Flatten)          (None, 8)                 0         
Total params: 2,048
Trainable params: 2,048
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_ip_in_ie (InputLayer)    (None, 1)                 0         
_________________________________________________________________
m_ip_in_ie (Model)           (None, 8)                 2048      
Total params: 2,048
Trainable params: 2,0

In [38]:
def get_ip_id_out_info(x):
    '''
    Extracts the sequence of outbound IP ids of a flow.
    x - Flow dictionary (one entry of net_data)
    returns - Integer numpy array of shape (n, 1), one row per id found in
              x['ip']['out']['id']. When the 'ip', 'out' or 'id' entry is
              missing or the list is empty, a single row [[0]] is returned
              so that the sequence is never empty.
    '''
    # Chained .get() calls avoid KeyError when any level is missing
    ids = x.get('ip', {}).get('out', {}).get('id', [])

    # Wrap each id in its own list to obtain a (n, 1) array
    ans = [[ip_id] for ip_id in ids]
    if len(ans)==0:
        ans=[[0]]
    return np.array(ans).astype(int)

# One (n, 1) integer array per flow; n differs from flow to flow
ip_id_out_data = [get_ip_id_out_info(x) for x in net_data]

# None marks the variable-length sequence axis; each step carries 1 feature
ip_id_out_shape = (None, 1)

# Summarise each id sequence into an 8-dimensional vector with an LSTM.
# NOTE: CuDNNLSTM is the cuDNN-backed Keras layer, which requires a GPU.
inp=L.Input(ip_id_out_shape)
x  =L.CuDNNLSTM(8)(inp)
ip_id_out_model = Model(inputs=inp, outputs=x)

print('Original model')
ip_id_out_model.summary()

# Bundle the sequences, the model that processes them and the per-sample shape
ip_id_out_ie = IE.IntelligentElement(ip_id_out_data, ip_id_out_model, ip_id_out_shape, name='ip_id_out_ie')

# Only the returned model is used, to print its summary
m, ii, oo = ip_id_out_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, None, 1)           0         
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 8)                 352       
Total params: 352
Trainable params: 352
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_ip_id_out_ie (InputLayer (None, None, 1)           0         
_________________________________________________________________
m_ip_id_out_ie (Model)       (None, 8)                 352       
Total params: 352
Trainable params: 352
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Shape of the first array returned for a batch made of flows 0-3
ip_id_out_ie.get_batch([0,1,2,3])[0].shape

(4, 50, 1)

In [41]:
def get_ip_id_in_info(x):
    '''
    Extracts the sequence of inbound IP ids of a flow.
    x - Flow dictionary (one entry of net_data); it is not modified
    returns - Integer numpy array of shape (n, 1), one row per id found in
              x['ip']['in']['id']. When the 'ip', 'in' or 'id' entry is
              missing or the list is empty, a single row [[0]] is returned
              so that the sequence is never empty.
    '''
    # Read-only lookups: the previous version inserted a default 'in' entry
    # into x['ip'], silently altering the caller's flow record.
    ids = x.get('ip', {}).get('in', {}).get('id', [])

    # Wrap each id in its own list to obtain a (n, 1) array
    ans = [[ip_id] for ip_id in ids]
    if len(ans)==0:
        ans=[[0]]
    return np.array(ans).astype(int)

# One (n, 1) integer array per flow holding the inbound IP ids
ip_id_in_data = [get_ip_id_in_info(x) for x in net_data]

# Reuse the LSTM model built for the outbound ids (ip_id_out_model), so both
# directions go through the same model object.
print('Original model')
ip_id_out_model.summary()

# Same model and input shape as ip_id_out_ie, different data and name
ip_id_in_ie = IE.IntelligentElement(ip_id_in_data, ip_id_out_model, ip_id_out_shape, name='ip_id_in_ie')

# Only the returned model is used, to print its summary
m, ii, oo = ip_id_in_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, None, 1)           0         
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 8)                 352       
Total params: 352
Trainable params: 352
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_ip_id_in_ie (InputLayer) (None, None, 1)           0         
_________________________________________________________________
m_ip_id_in_ie (Model)        (None, 8)                 352       
Total params: 352
Trainable params: 352
Non-trainable params: 0
_________________________________________________________________


## Packets Data

In [55]:
def get_pkt_info(x):
    '''
    Converts the packet list of a flow into a numeric matrix.
    x - Flow dictionary (one entry of net_data)
    returns - Numpy array with one row [b, is_outbound, ipt] per packet, where
              is_outbound is 1 when 'dir' equals '>' and 0 otherwise.
              A flow without a 'packets' key yields [[0, 1, 0]] (the default
              placeholder packet has direction '>'), while an empty 'packets'
              list yields [[0, 0, 0]].
    '''
    placeholder = [{'b': 0, 'dir': '>', 'ipt': 0}]

    rows = []
    for pkt in x.get('packets', placeholder):
        outbound = int(pkt['dir'] == '>')
        rows.append([pkt['b'], outbound, pkt['ipt']])

    # Guarantee at least one time step per flow
    if not rows:
        rows = [[0, 0, 0]]

    return np.array(rows)

# One (n_packets, 3) array per flow: columns are b, direction flag, ipt
pkt_data  = [get_pkt_info(x) for x in net_data]
# None marks the variable-length packet axis; each packet carries 3 features
pkt_shape = (None, 3)

# Summarise each packet sequence into a 16-dimensional vector with an LSTM.
# NOTE: CuDNNLSTM is the cuDNN-backed Keras layer, which requires a GPU.
inp=L.Input(pkt_shape)
x  =L.CuDNNLSTM(16)(inp)
pkt_model = Model(inputs=inp, outputs=x)

print('Original model')
pkt_model.summary()

# Bundle the sequences, the model that processes them and the per-sample shape
pkt_ie = IE.IntelligentElement(pkt_data, pkt_model, pkt_shape, name='pkt_ie')

# Only the returned model is used, to print its summary
m, ii, oo = pkt_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, None, 3)           0         
_________________________________________________________________
cu_dnnlstm_6 (CuDNNLSTM)     (None, 16)                1344      
Total params: 1,344
Trainable params: 1,344
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_pkt_ie (InputLayer)      (None, None, 3)           0         
_________________________________________________________________
m_pkt_ie (Model)             (None, 16)                1344      
Total params: 1,344
Trainable params: 1,344
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Shape of the first array returned for a batch made of flows 0-3
pkt_ie.get_batch([0,1,2,3])[0].shape

(4, 100, 3)

In [57]:
# Number of packets recorded for flow 3, to compare with the sequence length of the batch above
# (presumably get_batch pads shorter sequences to a common length - confirm in IntelligentElement)
len(net_data[3]['packets'])

1

In [58]:
# First five packet rows of flow 0 as extracted by get_pkt_info ...
print(pkt_data[0][0:5])
# ... and the same five rows as delivered inside a batch, to check that they match
pkt_ie.get_batch([0,1,2,3])[0][0][0:5]

[[ 441    1    0]
 [1460    0  257]
 [1460    0    0]
 [1460    0    0]
 [1460    0    0]]


array([[4.41e+02, 1.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 2.57e+02],
       [1.46e+03, 0.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 0.00e+00]])

## Root Data

In [59]:
def get_root_info(x):
    '''
    Extracts the flow-level (root) numeric features.
    x - Flow dictionary (one entry of net_data)
    returns - Float numpy array of shape (5,):
              [bytes_in, bytes_out, num_pkts_in, num_pkts_out, time_end - time_start].
              Missing counters default to 0; the duration is 0 unless both
              timestamps are present.
    '''
    # Timestamps get the same forgiving treatment as the counters instead of
    # raising KeyError. Requiring both avoids a bogus duration computed from
    # a single timestamp.
    if 'time_start' in x and 'time_end' in x:
        duration = x['time_end'] - x['time_start']
    else:
        duration = 0.0

    # An explicit float dtype keeps every flow's feature vector the same type,
    # regardless of whether the JSON values were parsed as ints or floats.
    return np.array([x.get('bytes_in', 0), x.get('bytes_out', 0),
                     x.get('num_pkts_in', 0), x.get('num_pkts_out', 0),
                     duration], dtype=float)
# One 5-element feature vector per flow
root_data = [get_root_info(x) for x in net_data]
root_shape = (5,)

# Child elements whose outputs feed the root model, in this order
children = [pkt_ie, ip_out_ie, ip_id_out_ie, ip_in_ie, ip_id_in_ie]
# The root model's input width is the 5 root features plus the value reported
# for the children (53 in the summary below, i.e. 5 + 16 + 4*8).
inp=L.Input( (root_shape[-1]+IE.get_children_sum_last_output_shapes(children),) )
x=inp
# Three hidden layers of 10 ReLU units each
for kk in range(3):
    x=L.Dense(10, activation='relu')(x)
# Three output classes, matching the integer labels 0, 1 and 2
x=L.Dense(3,activation='softmax')(x)
root_model = Model(inputs=inp, outputs=x)

print('Original model')
root_model.summary()

# The root element owns the flow-level features and references its children
root_ie = IE.IntelligentElement(root_data, root_model, root_shape, children_ie=children, name='root_ie')

# Only the returned model is used, to print its summary
m, ii, oo = root_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 53)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                540       
_________________________________________________________________
dense_10 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_11 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 33        
Total params: 793
Trainable params: 793
Non-trainable params: 0
_________________________________________________________________


Retrieved model
__________________________________________________________________________________________________
Layer (type)

In [60]:
# Feature vectors of the first five flows
root_data[0:5]

[array([1.82540000e+05, 8.65000000e+02, 1.26000000e+02, 5.50000000e+01,
        2.12327099e+00]),
 array([2.43000000e+02, 5.27710000e+04, 3.00000000e+01, 4.40000000e+01,
        1.07976089e+01]),
 array([1.19380000e+04, 8.62763700e+06, 3.56800000e+03, 5.94200000e+03,
        1.66916592e+01]),
 array([ 0., 19.,  0.,  1.,  0.]),
 array([ 0., 19.,  0.,  1.,  0.])]

In [61]:
# Shape of the array at index 3 of a three-flow batch from the root element
# (presumably one array per model input - confirm the ordering in IntelligentElement)
root_ie.get_batch([0,1,2])[3].shape

(3, 1)

# Build Generator

In [74]:
from keras.utils import Sequence

class IEDataGenerator(Sequence):
    '''
    Keras Sequence that serves (inputs, labels) batches from an IntelligentElement.
    ie - Object exposing `data` (its length is the number of samples) and
         `get_batch(indexes)`, which returns the model inputs for those samples
    labels - Indexable collection with one integer label per sample; each
             entry may be a scalar or a one-element array
    batch_size - Number of samples per batch
    shuffle - Whether to reshuffle the sample order at the end of every epoch
    '''
    def __init__(self, ie, labels, batch_size=128, shuffle=True):
        'Initialization'
        self.ie = ie

        self.batch_size = batch_size
        self.labels = labels
        self.nsamples = len(ie.data)
        assert self.nsamples == len(labels), 'Length of labels must match length of data'

        self.shuffle = shuffle
        # Build the initial (optionally shuffled) index order
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Floor division: a trailing partial batch is dropped, so fewer than
        # batch_size samples yields zero batches.
        return self.nsamples // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.nsamples)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates the inputs and labels for the given sample indexes'
        # Model inputs for the requested samples
        X = self.ie.get_batch(list_IDs_temp)

        # Build y from exactly the requested indexes, so it can never contain
        # uninitialised entries when fewer than batch_size indexes are given.
        # .item() unwraps scalars and one-element arrays alike, and raises
        # ValueError for entries holding more than one value.
        y = np.array([np.asarray(self.labels[ID]).item() for ID in list_IDs_temp],
                     dtype=int)

        return X, y

In [75]:
# expand_dims adds a trailing axis to labels, giving one single-element row per sample
ie_datagen = IEDataGenerator(root_ie, np.expand_dims(labels,axis=1))

In [76]:
# Fetch the first batch as a smoke test of the generator
xx, yy = ie_datagen[0]

In [77]:
# NOTE: this rebinds `root_model`. From here on it is the model retrieved from
# root_ie, not the Dense-only model defined earlier under the same name.
root_model, ii, oo = root_ie.retrieve_model_inputs_outputs()
root_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inp_pkt_ie (InputLayer)         (None, None, 3)      0                                            
__________________________________________________________________________________________________
inp_ip_out_ie (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
inp_ip_id_out_ie (InputLayer)   (None, None, 1)      0                                            
__________________________________________________________________________________________________
inp_ip_in_ie (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
inp_ip_id_

In [82]:
# Labels are integer class ids (0, 1, 2), hence the sparse_* loss and metric
root_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

In [85]:
from keras.callbacks import ReduceLROnPlateau
# Multiply the learning rate by 0.2 after 5 epochs without improvement, never going below 1e-5.
# The monitored value is the training accuracy: no validation data is passed to fit_generator below.
reduce_lr = ReduceLROnPlateau(monitor='sparse_categorical_accuracy', factor=0.2,
                              patience=5, min_lr=1e-5, verbose=1)

# Train for 10 epochs on batches produced by the generator
result = root_model.fit_generator(ie_datagen, epochs=10, callbacks=[reduce_lr])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x219bb858e48>