# Handling Networking Data

This notebook shows how to use IntelligentElement to handle nested networking data. 

The example uses captures obtained with Cisco Joy - https://github.com/cisco/joy

You can either use Joy to capture network traffic directly or use it to extract flow information from existing PCAP files.

## Obtain data from file

The first step is to read the Joy file, a gzip-compressed file that is loaded as a list of dictionaries (one per flow).

In [1]:
import json
import gzip
import struct

def GzipFileSize(filename):
    '''
    Auxiliary function that returns the size of a Gzip file.

    The value is read from the last 4 bytes of the file (the little-endian
    ISIZE field of the gzip trailer), which stores the uncompressed size
    modulo 2**32. The result is therefore not the true size for payloads of
    4 GiB or more, nor for archives made of several concatenated gzip members.

    filename - Path to the Gzip file
    returns - Uncompressed size of Gzip file in bytes (modulo 2**32)
    raises - OSError if the file cannot be opened or is shorter than 4 bytes
    '''
    # The context manager closes the handle even when seek/read raises.
    with open(filename, 'rb') as fo:
        fo.seek(-4, 2)  # whence=2: offset is relative to the end of the file
        trailer = fo.read(4)
    return struct.unpack('<I', trailer)[0]
def LoadJoyJson(filename):
    '''
    Loads a .gzip file extracted from PCAP using CISCO's Joy tool.

    The first line of the file is treated as a header: it is parsed (so a
    malformed header still raises) but it is not part of the result. Every
    following line is parsed as one JSON document. All backslashes are removed
    from a line before it is parsed. Lines that are not valid JSON are
    reported on stdout and skipped.

    filename - Path to the Gzip file
    returns - List of dictionaries. Each entry corresponds to a flow capture in Joy file.
              An empty file yields an empty list.
    '''
    fileData = []

    # Iterate line by line so the compressed file is never read in one piece.
    with gzip.open(filename) as infile:
        iterFile = iter(infile)

        header = next(iterFile, None)
        if header is None:
            # Nothing to read at all: no header and no flows.
            return fileData
        json.loads(header.decode('utf-8').replace('\\',''))

        for bline in iterFile:
            line = bline.decode('utf-8')
            try:
                fileData.append(json.loads(line.replace('\\','')))
            except ValueError:
                # json.JSONDecodeError is a ValueError; catching only that keeps
                # KeyboardInterrupt and unrelated errors from being swallowed.
                print('Problem loading JSON line')

    return fileData

In [2]:
files = ['PCData/news/newsFolha.gz', 'PCData/socialmedia/logFacebook.gz', 'PCData/videostreaming/logYoutube.gz', 'PCData/videostreaming/logYoutube3.gz']

# Substring of the file path that identifies a traffic category -> class label.
# Checked in this order; the first match wins.
category_labels = (('news', 0), ('socialmedia', 1), ('videostreaming', 2))

net_data = []
labels = []
for f in files:
    data = LoadJoyJson(f)

    label = next((lbl for category, lbl in category_labels if category in f), None)
    if label is None:
        # Without this check the flows would be appended with no labels,
        # silently misaligning net_data and labels.
        raise ValueError('Cannot infer a label from path: ' + f)

    net_data += data
    labels += [label]*len(data)

# One label per flow.
assert len(labels) == len(net_data)

In [3]:
# View the raw record of a single flow (the first one that was loaded)
net_data[0]

{'sa': '10.6.11.66',
 'da': '200.147.4.47',
 'pr': 6,
 'sp': 51697,
 'dp': 443,
 'bytes_out': 865,
 'num_pkts_out': 55,
 'bytes_in': 182540,
 'num_pkts_in': 126,
 'time_start': 1531230417.756095,
 'time_end': 1531230419.879366,
 'packets': [{'b': 441, 'dir': '>', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 257},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 249},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 2},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1480, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir': '<', 'ipt': 0},
  {'b': 1460, 'dir

# Build Model

We will showcase a simple model that will handle:

- bytes_out, bytes_in, num_pkts_out, num_pkts_in, time_end-time_start in the first level;
- packets as a nested list of dictionaries
- ip with nested out and in, with ttl as a number and id as list

In [4]:
from keras import Model
from keras import layers as L
import numpy as np

# Put the directory two levels above the current one at the front of sys.path,
# presumably so that the `import IntelligentElement` below resolves to the local copy.
# NOTE(review): os.path.abspath resolves a relative result of inspect.getfile
# against the current working directory, so this presumably relies on the
# notebook being run from its own folder - confirm.
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
sys.path.insert(0,parentdir) 

%load_ext autoreload
%autoreload 2

import IntelligentElement as IE

Using TensorFlow backend.


## Packets Data

In [9]:
def get_pkt_info(x):
    '''
    Builds the per-packet feature matrix of one flow.

    x - Flow dictionary. Its optional 'packets' entry is a list of
        dictionaries with the keys 'b', 'dir' and 'ipt'.
    returns - Numpy array with one row per packet: [b, 1 if dir is '>' else 0, ipt].
              A flow whose 'packets' entry is missing or empty yields the single
              placeholder row [0, 1, 0], so the result always has shape (n, 3).
    '''
    # `or` covers an empty list as well as a missing key: np.array([]) would
    # have shape (0,) and would not match the (None, 3) input shape.
    pkt_data = x.get('packets') or [{'b': 0, 'dir': '>', 'ipt': 0}]
    return np.array([[y['b'], int(y['dir']=='>'), y['ipt']] for y in pkt_data])

# One packet feature matrix per flow, in the same order as net_data.
pkt_data = list(map(get_pkt_info, net_data))

In [10]:
# Each flow is a variable-length sequence (None) of packets with 3 features
# each, as produced by get_pkt_info: [b, direction flag, ipt].
pkt_shape = (None, 3)

# Encode the packet sequence into a fixed-size vector of 16 values.
# CuDNNLSTM is the cuDNN-backed LSTM layer and only runs on a GPU.
inp=L.Input(pkt_shape)
x  =L.CuDNNLSTM(16)(inp)
pkt_model = Model(inputs=inp, outputs=x)

print('Original model')
pkt_model.summary()

# Wrap the packet data and its model in an IntelligentElement.
pkt_ie = IE.IntelligentElement(pkt_data, pkt_model, pkt_shape, name='pkt_ie')

# Only the returned model is used here, to print its summary.
# NOTE(review): ii / oo are presumably its input and output tensors - confirm.
m, ii, oo = pkt_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 3)           0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 16)                1344      
Total params: 1,344
Trainable params: 1,344
Non-trainable params: 0
_________________________________________________________________


Retrieved model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inp_pkt_ie (InputLayer)      (None, None, 3)           0         
_________________________________________________________________
m_pkt_ie (Model)             (None, 16)                1344      
Total params: 1,344
Trainable params: 1,344
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Shape of the first array returned for a batch made of flows 0 to 3.
# NOTE(review): presumably (batch, padded sequence length, features) - confirm against IntelligentElement.
pkt_ie.get_batch([0,1,2,3])[0].shape

(4, 100, 3)

In [17]:
# Number of packets recorded for the flow at index 3
len(net_data[3]['packets'])

1

In [20]:
# Compare the first five packets of flow 0 as extracted by get_pkt_info with
# the first five packets of the same flow inside a batch from the IntelligentElement.
print(pkt_data[0][0:5])
pkt_ie.get_batch([0,1,2,3])[0][0][0:5]

[[ 441    1    0]
 [1460    0  257]
 [1460    0    0]
 [1460    0    0]
 [1460    0    0]]


array([[4.41e+02, 1.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 2.57e+02],
       [1.46e+03, 0.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 0.00e+00],
       [1.46e+03, 0.00e+00, 0.00e+00]])

## Root Data

In [14]:
def get_root_info(x):
    '''
    Extracts the flow-level features of one flow.

    x - Flow dictionary. The byte and packet counters are optional and default
        to 0; 'time_end' and 'time_start' are required (KeyError otherwise).
    returns - List [bytes_in, bytes_out, num_pkts_in, num_pkts_out, time_end - time_start]
    '''
    counter_keys = ('bytes_in', 'bytes_out', 'num_pkts_in', 'num_pkts_out')
    features = [x.get(key, 0) for key in counter_keys]
    duration = x['time_end']-x['time_start']
    features.append(duration)
    return features
# One list of 5 flow-level features per flow, in the same order as net_data.
root_data = [get_root_info(x) for x in net_data]
root_shape = (5,)

# The root model input is sized as the 5 root features plus the output size of
# the packet model (16), i.e. 21 values per flow.
inp=L.Input( (root_shape[-1]+pkt_ie.model.output_shape[-1],) )
x=inp
# Three hidden layers of 10 ReLU units each.
for kk in range(3):
    x=L.Dense(10, activation='relu')(x)
# Softmax over the 3 traffic classes (labels 0, 1 and 2).
x=L.Dense(3,activation='softmax')(x)
root_model = Model(inputs=inp, outputs=x)

print('Original model')
root_model.summary()

# The packet element is registered as a child of the root element.
root_ie = IE.IntelligentElement(root_data, root_model, root_shape, children_ie=[pkt_ie], name='root_ie')

# Only the returned model is used here, to print its summary.
# NOTE(review): ii / oo are presumably its input and output tensors - confirm.
m, ii, oo = root_ie.retrieve_model_inputs_outputs()
print('\n\nRetrieved model')
m.summary()

Original model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 21)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                220       
_________________________________________________________________
dense_6 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_7 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 33        
Total params: 473
Trainable params: 473
Non-trainable params: 0
_________________________________________________________________


Retrieved model
__________________________________________________________________________________________________
Layer (type)

In [6]:
# Root-level features of the first five flows:
# [bytes_in, bytes_out, num_pkts_in, num_pkts_out, time_end - time_start]
root_data[0:5]

[[182540, 865, 126, 55, 2.1232709884643555],
 [243, 52771, 30, 44, 10.797608852386475],
 [11938, 8627637, 3568, 5942, 16.691659212112427],
 [0, 19, 0, 1, 0.0],
 [0, 19, 0, 1, 0.0]]