### IMPORTANT!!

### from the root NetworkML directory, make sure you run `pip3 install .` first
### you will also need to run `pip3 install nest_asyncio`, since it's a hack needed to run this in JupyterLab

In [2]:
# initial setup: prepare the command-line arguments the PCAP parser will read
import sys

# PCAP to convert (we'll use one included in the tests)
path = '../tests/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap'

# name the output explicitly so it's easy to find afterwards
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# arguments for arg parse: script name first, then the output flag and the input path
cli_args = [f'-o{output}', path]
sys.argv = ['pcap_to_csv.py'] + cli_args

In [3]:
# hack for jupyterlab since pyshark tries to take the main run loop
# (keep this ahead of the parser import and run below)
import nest_asyncio
nest_asyncio.apply()

# import the class for converting PCAPs to CSVs
from networkml.parsers.pcap_to_csv import PCAPToCSV
instance = PCAPToCSV()

# this will parse the args currently set in sys.argv and run using them
instance.main()

# this will take a minute or so to run

INFO:networkml.parsers.pcap_to_csv:Including the following layers in CSV (if they exist): ['<IP Layer>', '<ETH Layer>', '<TCP Layer>', '<UDP Layer>', '<ICMP Layer>', '<ICMPv6 Layer>', '<DNS Layer>', '<DHCP Layer>', '<DHCPv6 Layer>', '<ARP Layer>', '<IP6 Layer>', '<TLS Layer>']
INFO:networkml.parsers.pcap_to_csv:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz']


In [5]:
# the parser wrote a gzipped csv, so let's quickly pop that open and see what we have
# the rows come back as a list of dictionaries (DictReader from the CSV lib), keyed by fieldname
# every dictionary in that list is one record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
csv_path = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'
rows = instance.get_rows(csv_path)
print(rows[0])



{'icmp.seq_le': '', 'tcp.options.nop': '', 'tcp.options.eol_tree': '', 'tcp.hdr_len': '20', 'arp.proto.size_raw': '', 'tcp.window_size_scalefactor_raw': "['3fa0', 48, 2, 0, 15]", 'eth.type_raw': "['0800', 12, 2, 0, 5]", 'dhcp.file_raw': '', 'eth.dst.oui_resolved': 'Sophos Ltd', 'eth.padding_raw': '', 'http.request': '', 'arp.dst.proto_ipv4': '', 'tcp.analysis.bytes_in_flight': '943', 'dhcp.ip.your': '', 'tls.record.content_type_raw': '', 'tcp.window_size': '16288', 'ip.flags_tree': '', 'eth.addr.oui_resolved_raw': "['4061869af1f5', 6, 6, 0, 26]", 'tcp.analysis.duplicate_ack_num': '', 'tls.handshake': '', 'M-SEARCH * HTTP/1.1\\r\\n': '', 'icmp.checksum_raw': '', 'arp.proto.type_raw': '', 'tcp.seq_raw': '2542412440', 'dns.count.auth_rr': '', 'tcp.stream': '0', 'ip.flags.mf_raw': "['0', 20, 2, 8192, 2]", 'dhcp.hw.mac_addr': '', 'http.request.line_raw': '', 'ip.flags.df': '1', 'eth.addr': '40:61:86:9a:f1:f5', 'tcp.flags.push': '1', 'tls.record.content_type': '', 'tcp.flags.ecn': '0', 'arp.

In [7]:
# at this point we have a gzipped csv holding every field we could extract at the 'packet' level
# (a different level, like 'flow', could have been requested with an arg above)
# the featurizer lets us reduce, change, or add which fields end up included

# where the featurizer functions live
features_path = '../networkml/featurizers/funcs'

# input: the gzipped csv we just made
path = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# output: named explicitly so it's easy to find
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz'

# arguments for arg parse: script name, then the output and funcs-path flags, then the input
featurizer_flags = [f'-o{output}', f'-p{features_path}']
sys.argv = ['csv_to_features.py'] + featurizer_flags + [path]

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
instance.main()


Importing class: Host
Importing class: Generic
Importing class: Flow
Importing class: Packet
Running method: Flow/default_tcp_5tuple
Running method: Flow/default_udp_5tuple


INFO:networkml.featurizers.csv_to_features:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz']


In [8]:
# the featurizer also wrote a gzipped csv, so let's quickly pop that open and see what we have
# the rows come back as a list of dictionaries (DictReader from the CSV lib), keyed by fieldname
# every dictionary in that list is one record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
features_csv = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz'
rows = instance.get_rows(features_csv)
print(rows[0])

{'udp.dstport': '', 'udp.srcport': '', 'ip.dst_host': '72.14.213.138', 'tcp.srcport': '57011', 'tcp.dstport': '80', 'ip.src_host': '192.168.3.131', 'frame.protocols': 'eth:ethertype:ip:tcp:http'}
