## IMPORTANT!!

### From the root NetworkML directory, make sure you first run:
```
pip3 install .
pip3 install nest_asyncio
```

NOTE: nest_asyncio is a needed hack to run this in jupyterlab/notebook


In [5]:
# some initial setup
import sys

# let's set a path to a pcap (we'll use one included in the tests)
path = '../tests/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap'

# let's change the output so it's easy to find
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# set arguments for arg parse
sys.argv = ['pcap_to_csv.py', f'-o{output}', path]

In [6]:
# hack for jupyterlab since pyshark tries to take the main run loop
import nest_asyncio
nest_asyncio.apply()

# import the class for converting PCAPs to CSVs
from networkml.parsers.pcap_to_csv import PCAPToCSV
instance = PCAPToCSV()

# this will parse the args we specified above in sys.argv and run using them
instance.main()

# the will take a minute or so to run

INFO:networkml.parsers.pcap_to_csv:Including the following layers in CSV (if they exist): ['<IP Layer>', '<ETH Layer>', '<TCP Layer>', '<UDP Layer>', '<ICMP Layer>', '<ICMPv6 Layer>', '<DNS Layer>', '<DHCP Layer>', '<DHCPv6 Layer>', '<ARP Layer>', '<IP6 Layer>', '<TLS Layer>']
INFO:networkml.parsers.pcap_to_csv:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz']


In [7]:
# the output is a gzipped csv, so let's quickly pop that open and see what we have
# we're going to use DictReader from the CSV lib, so we'll get back a list of dictionaries, where the keys in the dicts are the fieldnames
# each dictionary in the list is a record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz')
print(rows[0])



{'udp.srcport': '', 'layers': '[<ETH Layer>, <IP Layer>, <TCP Layer>, <HTTP Layer>, <FRAME_RAW Layer>, <ETH_RAW Layer>, <IP_RAW Layer>, <TCP_RAW Layer>, <HTTP_RAW Layer>]', 'http.request.line_raw': '', 'dhcp.ip.your_raw': '', 'arp.src.proto_ipv4_raw': '', 'dhcp.flags.reserved': '', 'http.request': '', 'dns.count.add_rr': '', 'ip.flags.df_raw': "['1', 20, 2, 16384, 2]", 'tls.record': '', 'dns.id_raw': '', 'frame.encap_type': '1', 'tcp.time_relative': '0.000000000', 'tls.handshake.session_id_length': '', 'dhcp.flags.reserved_raw': '', 'eth.dst.oui_raw': "['001a8c', 0, 3, 0, 6]", 'dhcp.ip.client_raw': '', 'eth.src': '40:61:86:9a:f1:f5', 'tls.record.length_raw': '', 'ip.ttl_raw': "['80', 22, 1, 0, 4]", 'tls.handshake.challenge_raw': '', 'dhcp.option.padding_raw': '', 'tcp.flags.str_raw': "['5018', 46, 2, 0, 26]", 'dhcp.hops_raw': '', 'dhcp.hw.len': '', 'tls.handshake_raw': '', 'ip.frag_offset_raw': "['0', 20, 2, 8191, 5]", 'tcp.urgent_pointer_raw': "['0000', 52, 2, 0, 5]", 'tls.handshake.t

In [8]:
# we have a gzipped csv with all of the fields we could extract at the 'packet' level
# (we could have supplied an arg above to do a different level, like 'flow')
# we can now take that file and reduce or change or add which fields should be included using the featurizer

# let's set a path to a gzipped csv (we'll use one we just made)
path = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# let's change the output so it's easy to find
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz'

# we need to specify where the featurizer functions are
features_path = '../networkml/featurizers/funcs'

# set arguments for arg parse
sys.argv = ['csv_to_features.py', f'-o{output}', f'-p{features_path}', path]

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
instance.main()


Importing class: Host
Importing class: Generic
Importing class: Flow
Importing class: Packet
Running method: Flow/default_tcp_5tuple
Running method: Flow/default_udp_5tuple


INFO:networkml.featurizers.csv_to_features:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz']


In [9]:
# the output is a gzipped csv again, so let's quickly pop that open and see what we have
# we're going to use DictReader from the CSV lib, so we'll get back a list of dictionaries, where the keys in the dicts are the fieldnames
# each dictionary in the list is a record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz')
print(rows[0])

{'ip.dst_host': '72.14.213.138', 'udp.srcport': '', 'udp.dstport': '', 'frame.protocols': 'eth:ethertype:ip:tcp:http', 'ip.src_host': '192.168.3.131', 'tcp.dstport': '80', 'tcp.srcport': '57011'}


### Writing New Featuzier Functions
now that we can see how to turn PCAPs into CSVs and then use the featurizer to change the columns and make a new CSVlet's now look at how to write new featurizer functions and use them here's an example of an already included class with functions.

In [10]:
with open('../networkml/featurizers/funcs/flow.py', 'r') as f:
    for line in f:
        print(line)

from networkml.featurizers.features import Features



class Flow(Features):



    def default_tcp_5tuple(self, rows):

        fields = ['ip.src_host', 'ip.dst_host', 'tcp.dstport', 'tcp.srcport', 'frame.protocols']

        return self.get_columns(fields, rows)



    def default_udp_5tuple(self, rows):

        fields = ['ip.src_host', 'ip.dst_host', 'udp.dstport', 'udp.srcport', 'frame.protocols']

        return self.get_columns(fields, rows)



In [11]:
# So simply use an existing python file and class already in the `funcs` directory
# or create a new one, and make sure the class subclasses `Features`
# and the function signatures take in rows and return rows
# the above example as a helper function `get_columns` that lets you provide a list of fields and the rows
# and returns the rows with only those fields

# You don't need to use the helper function, but the returns rows should be a list of dictionaries
# just like the input of rows is (the same thing you get back from a CSV DictReader)

# Here's a sample to test:
from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz')
print(rows[0])

{'udp.srcport': '', 'layers': '[<ETH Layer>, <IP Layer>, <TCP Layer>, <HTTP Layer>, <FRAME_RAW Layer>, <ETH_RAW Layer>, <IP_RAW Layer>, <TCP_RAW Layer>, <HTTP_RAW Layer>]', 'http.request.line_raw': '', 'dhcp.ip.your_raw': '', 'arp.src.proto_ipv4_raw': '', 'dhcp.flags.reserved': '', 'http.request': '', 'dns.count.add_rr': '', 'ip.flags.df_raw': "['1', 20, 2, 16384, 2]", 'tls.record': '', 'dns.id_raw': '', 'frame.encap_type': '1', 'tcp.time_relative': '0.000000000', 'tls.handshake.session_id_length': '', 'dhcp.flags.reserved_raw': '', 'eth.dst.oui_raw': "['001a8c', 0, 3, 0, 6]", 'dhcp.ip.client_raw': '', 'eth.src': '40:61:86:9a:f1:f5', 'tls.record.length_raw': '', 'ip.ttl_raw': "['80', 22, 1, 0, 4]", 'tls.handshake.challenge_raw': '', 'dhcp.option.padding_raw': '', 'tcp.flags.str_raw': "['5018', 46, 2, 0, 26]", 'dhcp.hops_raw': '', 'dhcp.hw.len': '', 'tls.handshake_raw': '', 'ip.frag_offset_raw': "['0', 20, 2, 8191, 5]", 'tcp.urgent_pointer_raw': "['0000', 52, 2, 0, 5]", 'tls.handshake.t

In [12]:
from networkml.featurizers.features import Features

class Flow(Features):

    def example_simple(self, rows):
        fields = ['layers', 'eth.src.oui_resolved', 'eth.dst.oui_resolved']
        return self.get_columns(fields, rows)

flow = Flow()
new_rows = flow.example_simple(rows)
print(new_rows[0])

{'layers': '[<ETH Layer>, <IP Layer>, <TCP Layer>, <HTTP Layer>, <FRAME_RAW Layer>, <ETH_RAW Layer>, <IP_RAW Layer>, <TCP_RAW Layer>, <HTTP_RAW Layer>]', 'eth.src.oui_resolved': "Micro-Star Int'L Co.,Ltd", 'eth.dst.oui_resolved': 'Sophos Ltd'}


In [13]:
# great, but how did we know the fields?
rows[0].keys()

dict_keys(['udp.srcport', 'layers', 'http.request.line_raw', 'dhcp.ip.your_raw', 'arp.src.proto_ipv4_raw', 'dhcp.flags.reserved', 'http.request', 'dns.count.add_rr', 'ip.flags.df_raw', 'tls.record', 'dns.id_raw', 'frame.encap_type', 'tcp.time_relative', 'tls.handshake.session_id_length', 'dhcp.flags.reserved_raw', 'eth.dst.oui_raw', 'dhcp.ip.client_raw', 'eth.src', 'tls.record.length_raw', 'ip.ttl_raw', 'tls.handshake.challenge_raw', 'dhcp.option.padding_raw', 'tcp.flags.str_raw', 'dhcp.hops_raw', 'dhcp.hw.len', 'tls.handshake_raw', 'ip.frag_offset_raw', 'tcp.urgent_pointer_raw', 'tls.handshake.type_raw', 'dhcp.option.type_tree', 'icmp.type_raw', 'dhcp.cookie_raw', 'ip.flags.rb', 'eth.dst.oui', 'tcp.options.sack_tree', 'arp.opcode_raw', 'ip.addr', 'dns.flags_tree', 'tcp.analysis.push_bytes_sent', 'dhcp.hw.len_raw', 'tcp.options.wscale', 'ip.ttl_tree', 'dhcp.id', 'ip.hdr_len', 'tcp.options.mss_tree', 'udp.checksum.status', 'udp.length_raw', 'tcp.port_raw', 'ip.src', 'tcp.flags.cwr_raw',

In [26]:
# what if we want to create a new field/column or reduce the number of records/rows?
from networkml.featurizers.features import Features

class NewColumn(Features):

    def example_modify(self, rows):
        # reduce rows first as needed
        fields = ['layers', 'eth.src.oui_resolved', 'eth.dst.oui_resolved', 'ip.src', 'ip.dst']
        rows = self.get_columns(fields, rows)
        
        # create new columns using existing column info
        last_layer = 'Last Layer'
        combined_ips = 'Combined IPs'
        # each row is a dict
        for row in rows:
            # not all rows are guaranteed to have 'layers'
            if 'layers' in row:  
                # get the last element in the stringified list and clean it up
                row[last_layer] = row['layers'].split('<')[-1][:-2].split()[0]
            
            # not all rows are guaranteed to have 'ip.src' and ip.dst
            if 'ip.src' in row and 'ip.dst' in row:
                # combine two fields with a colon, making a new field
                row[combined_ips] = row['ip.src']+':'+row['ip.dst']
        
        # remove duplicate rows
        rows = [dict(t) for t in {tuple(d.items()) for d in rows}]
        return rows

print(f'Number of original records: {len(rows)}\n')
ncol = NewColumn()
new_rows = ncol.example_modify(rows)
print(f'Fields/Values in first record (row): {new_rows[0]}\n')
print(f'Number of records after deduplication: {len(new_rows)}\n')

Number of original records: 14261

Fields/Values in first record (row): {'layers': '[<ETH Layer>, <IP Layer>, <UDP Layer>, <DATA Layer>, <FRAME_RAW Layer>, <ETH_RAW Layer>, <IP_RAW Layer>, <UDP_RAW Layer>, <DATA_RAW Layer>]', 'eth.src.oui_resolved': 'Sophos Ltd', 'eth.dst.oui_resolved': 'Quanta Computer Inc.', 'ip.src': '217.174.56.245', 'ip.dst': '172.16.255.1', 'Last Layer': 'DATA_RAW', 'Combined IPs': '217.174.56.245:172.16.255.1'}

Number of records after deduplication: 708

