## IMPORTANT!!

### From the root NetworkML directory, make sure you first run:
```
pip3 install .
pip3 install nest_asyncio
```

NOTE: nest_asyncio is a needed hack to run this in jupyterlab/notebook. Additionally, you will also need to download Wireshark/tshark in order to use this notebook and then ensure that tshark is in the file path.

In [1]:
# some initial setup
import sys

# let's set a path to a pcap (we'll use one included in the tests)
path = '../tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap'

# let's change the output so it's easy to find
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# set arguments for arg parse
sys.argv = ['pcap_to_csv.py', f'-o{output}', path]

In [2]:
# hack for jupyterlab since pyshark tries to take the main run loop
import nest_asyncio
nest_asyncio.apply()

# import the class for converting PCAPs to CSVs
from networkml.parsers.pcap_to_csv import PCAPToCSV
instance = PCAPToCSV()

# this will parse the args we specified above in sys.argv and run using them
instance.main()

# the will take a minute or so to run

INFO:networkml.parsers.pcap_to_csv:Processing ../tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap
INFO:networkml.parsers.pcap_to_csv:Finished ../tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap. 1/1 PCAPs done.
INFO:networkml.parsers.pcap_to_csv:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz']


''

In [3]:
# the output is a gzipped csv, so let's quickly pop that open and see what we have
# we're going to use DictReader from the CSV lib, so we'll get back a list of dictionaries, where the keys in the dicts are the fieldnames
# each dictionary in the list is a record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz', 'both')
print(rows[0])



AttributeError: 'CSVToFeatures' object has no attribute 'get_rows'

In [4]:
# we have a gzipped csv with all of the fields we could extract at the 'packet' level
# (we could have supplied an arg above to do a different level, like 'flow')
# we can now take that file and reduce or change or add which fields should be included using the featurizer

# let's set a path to a gzipped csv (we'll use one we just made)
path = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz'

# let's change the output so it's easy to find
output = 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz'

# we need to specify where the featurizer functions are
features_path = '../networkml/featurizers/funcs'

# set arguments for arg parse
sys.argv = ['csv_to_features.py', f'-o{output}', f'-p{features_path}', path]

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
instance.main()


INFO:networkml.featurizers.csv_to_features:Importing trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz size 15717973
INFO:networkml.featurizers.csv_to_features:Featurizing trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz


Importing class: Host
Importing class: SessionHost
Importing class: Generic
Importing class: Flow
Importing class: Packet
running Host/host_tshark_all...calculating intermediates.9 MACs, 1 sessions.MAC 00:04:00:81:81:d0 has minimum number of source IPs, selected as canonical source.MAC 1/1 100%.

INFO:networkml.featurizers.csv_to_features:Finished trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz. 1/1 CSVs done.
INFO:networkml.featurizers.csv_to_features:GZipped CSV file(s) written out to: ['trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz']


1s


''

In [5]:
# the output is a gzipped csv again, so let's quickly pop that open and see what we have
# we're going to use DictReader from the CSV lib, so we'll get back a list of dictionaries, where the keys in the dicts are the fieldnames
# each dictionary in the list is a record (packet)

from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.features.csv.gz', 'both')
print(rows[0])

AttributeError: 'CSVToFeatures' object has no attribute 'get_rows'

### Writing New Featuzier Functions
now that we can see how to turn PCAPs into CSVs and then use the featurizer to change the columns and make a new CSV
let's now look at how to write new featurizer functions and use them here's an example of an already included class with functions.

In [7]:
with open('../networkml/featurizers/funcs/flow.py', 'r') as f:
    for line in f:
        print(line)

from networkml.featurizers.features import Features



class Flow(Features):



    def default_tcp_5tuple(self, rows):

        fields = ['ip.src_host', 'ip.dst_host', 'tcp.dstport', 'tcp.srcport', 'frame.protocols']

        return self.get_columns(fields, rows)



    def default_udp_5tuple(self, rows):

        fields = ['ip.src_host', 'ip.dst_host', 'udp.dstport', 'udp.srcport', 'frame.protocols']

        return self.get_columns(fields, rows)



In [8]:
# So simply use an existing python file and class already in the `funcs` directory
# or create a new one, and make sure the class subclasses `Features`
# and the function signatures take in rows and return rows
# the above example as a helper function `get_columns` that lets you provide a list of fields and the rows
# and returns the rows with only those fields

# You don't need to use the helper function, but the returns rows should be a list of dictionaries
# just like the input of rows is (the same thing you get back from a CSV DictReader)

# Here's a sample to test:
from networkml.featurizers.csv_to_features import CSVToFeatures
instance = CSVToFeatures()
rows = instance.get_rows('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz', 'both')
print(rows[0])

AttributeError: 'CSVToFeatures' object has no attribute 'get_rows'

In [9]:
from networkml.featurizers.features import Features

class Flow(Features):

    def example_simple(self, rows):
        fields = ['layers', 'eth.src.oui_resolved', 'eth.dst.oui_resolved']
        return self.get_columns(fields, rows)

flow = Flow()
new_rows = flow.example_simple(rows)
print(new_rows[0])

NameError: name 'rows' is not defined

In [10]:
# great, but how did we know the fields?
rows[0].keys()

NameError: name 'rows' is not defined

In [11]:
# what if we want to create a new field/column or reduce the number of records/rows?
from networkml.featurizers.features import Features

class NewColumn(Features):

    def example_modify(self, rows):
        # reduce rows first as needed
        fields = ['layers', 'eth.src.oui_resolved', 'eth.dst.oui_resolved', 'ip.src', 'ip.dst']
        rows = self.get_columns(fields, rows)
        
        # create new columns using existing column info
        last_layer = 'Last Layer'
        combined_ips = 'Combined IPs'
        # each row is a dict
        for row in rows:
            # not all rows are guaranteed to have 'layers'
            if 'layers' in row:  
                # get the last element in the stringified list and clean it up
                row[last_layer] = row['layers'].split('<')[-1][:-2].split()[0]
            
            # not all rows are guaranteed to have 'ip.src' and ip.dst
            if 'ip.src' in row and 'ip.dst' in row:
                # combine two fields with a colon, making a new field
                row[combined_ips] = row['ip.src']+':'+row['ip.dst']
                # remove ip.src and ip.dst now that we have them
                del row['ip.src']
                del row['ip.dst']
        
        # remove duplicate rows
        rows = [dict(t) for t in {tuple(d.items()) for d in rows}]
        return rows

print(f'Number of original records: {len(rows)}\n')
ncol = NewColumn()
new_rows = ncol.example_modify(rows)
print(f'Fields/Values in first record (row): {new_rows[0]}\n')
print(f'Number of records after deduplication: {len(new_rows)}\n')

NameError: name 'rows' is not defined