# Bag-of-Words
In this notebook, we see how to transform the list of protocols (text) that Pyshark extracts from a packet into either a numerical vector (one element per protocol in the vocabulary) or a single integer number.

In [14]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Machine Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pyshark

# We need the following to get around “RuntimeError: This event loop is already running” when using Pyshark within Jupyter notebooks.
# Not needed in stand-alone Python projects
import nest_asyncio
nest_asyncio.apply()  

from sklearn.feature_extraction.text import CountVectorizer

# Path to the capture file (relative path)
capture_file = './PCAPs/benign-syn.pcap'
# Open the capture with Pyshark; `cap` is iterated by the following cells to read the packets
cap = pyshark.FileCapture(capture_file)

# Build the vocabulary
You can define a vocabulary by analysing the packets of the dataset, or use a pre-defined vocabulary that contains only the protocols that are relevant for the application scenario.

In [18]:
# Select the vocabulary source:
#   "static"  -> pre-defined list of protocols
#   "dataset" -> protocols observed in the packets of the capture file
VOCABULARY = "static" # static or dataset

# Step 1: Build the list of protocol names (the "corpus" used to fit the vectorizer)
if VOCABULARY == "static":
    # Static list of protocols.
    bow = ['arp','data','dhcp','dns','eth','ftp','http','icmp','ip','ssdp','ssl','tcp','telnet','tls','udp']
    print ("Static dictionary: ", bow)
elif VOCABULARY == "dataset":
    # Collect every protocol name that appears in the colon-separated
    # protocol list of each packet; the set removes duplicates.
    observed_protocols = set()
    for packet in cap:
        observed_protocols.update(str(packet.frame_info.protocols).split(':'))
    bow = sorted(observed_protocols)
    print ("Dictionary from the dataset: ", bow)
else:
    raise ValueError('VOCABULARY must be "static" or "dataset", got %r' % (VOCABULARY,))

# Step 2: Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Step 3: Fit the vectorizer on the corpus and transform the documents into BoW vectors 
X = vectorizer.fit_transform(bow)

# X is now a sparse matrix representing the BoW vectors of the documents 
print(X.toarray())

# Step 4: Get the feature names (words in the vocabulary) 
feature_names = vectorizer.get_feature_names_out()

# One weight (a distinct power of two) per vectorizer feature, used to fold a
# BoW vector into a single integer. The size is taken from the fitted
# vocabulary rather than from `bow`: the vectorizer tokenizes the entries
# itself, so the number of features is not guaranteed to equal len(bow).
powers_of_two = np.array([2**i for i in range(len(feature_names))])

Dictionary from the dataset:  ['arp', 'data', 'dhcp', 'dns', 'eth', 'ftp', 'http', 'icmp', 'ip', 'ssdp', 'ssl', 'tcp', 'telnet', 'tls', 'udp']
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]


# Feature extraction and preprocessing
Now we use the bag-of-words to translate the list of protocols into a numerical vector or an integer number.

In [17]:
for packet in cap:
    # Colon-separated protocol list exposed by Pyshark, e.g. "eth:ethertype:ip:tcp"
    protocol_stack = str(packet.frame_info.protocols)
    print (protocol_stack)
    protocol_counts = vectorizer.transform([protocol_stack]).toarray().tolist()[0]
    # Keep presence flags only: a protocol is sometimes listed twice in
    # frame_info.protocols, and a count of 2 at position i would add 2 * 2**i,
    # which is the same contribution as the protocol at position i+1.
    protocols = [1 if count >= 1 else 0 for count in protocol_counts]
    # Fold the binary vector into one integer: bit i is set when feature i is present
    protocols_value = int(np.dot(np.array(protocols), powers_of_two))
    print ("Numerical vector: ",protocols)
    print ("Integer number: ", protocols_value)

eth:ethertype:ip:udp:dns
Numerical vector:  [0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]
Integer number:  2140
eth:ethertype:ip:tcp
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]
Integer number:  600
eth:ethertype:ip:tcp
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]
Integer number:  600
eth:ethertype:ip:tcp:tls
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]
Integer number:  1624
eth:ethertype:ip:tcp
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]
Integer number:  600
eth:ethertype:ip:tcp:tls
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]
Integer number:  1624
eth:ethertype:ip:tcp
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]
Integer number:  600
eth:ethertype:ip:tcp:tls
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]
Integer number:  1624
eth:ethertype:ip:tcp
Numerical vector:  [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]
Integer number:  600
eth:ethertype:ip:tcp
Numerical vector:  [0, 