In [None]:
# Install graph-tool from the skewed.de apt repository.
# Step 1: make sure the apt keyring directory and a private GnuPG home exist.
!mkdir -p /etc/apt/keyrings; mkdir -p ~/.gnupg; chmod 700 ~/.gnupg
# Step 2: fetch the repository signing key into a dedicated keyring file.
!gpg --no-default-keyring --keyring /etc/apt/keyrings/skewed.de.gpg --keyserver keyserver.ubuntu.com --recv-keys 612DEFB798507F25
# Step 3: register the repository for the running release codename, trusted only via that keyring.
!echo "deb [signed-by=/etc/apt/keyrings/skewed.de.gpg] https://downloads.skewed.de/apt $(lsb_release -s -c) main" > /etc/apt/sources.list.d/skewed.list
# Step 4: refresh the package lists and install graph-tool together with matplotlib and cairo bindings.
!apt-get update
!apt-get install python3-graph-tool python3-matplotlib python3-cairo

gpg: keybox '/etc/apt/keyrings/skewed.de.gpg' created
gpg: /root/.gnupg/trustdb.gpg: trustdb created
gpg: key 612DEFB798507F25: public key "Tiago de Paula Peixoto <tiago@skewed.de>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [802 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://downloads.skewed.de/apt jammy InRelease [7,535 B]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:8 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:9 https://downloads.skewed.de/apt jammy/main amd64 Packages [3,167 B]
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/ma

In [None]:
# Workaround: Colab's Python installation differs from the system Python that the
# apt packages target, so the cairo bindings are rebuilt for the notebook's interpreter:
#   1. remove the apt-provided python3-cairo,
#   2. install the cairo headers and build tooling,
#   3. force pip to (re)install pycairo,
#   4. install zstandard (NOTE(review): presumably a graph-tool runtime dependency — confirm).
!apt purge python3-cairo
!apt install libcairo2-dev pkg-config python3-dev
!pip install --force-reinstall pycairo
!pip install zstandard

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following packages will be REMOVED:
  python3-cairo*
0 upgraded, 0 newly installed, 1 to remove and 45 not upgraded.
After this operation, 310 kB disk space will be freed.
(Reading database ... 130125 files and directories currently installed.)
Removing python3-cairo:amd64 (1.20.1-3build1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pkg-config is already the newest version (0.29.2-1ubuntu3).
python3-dev is already the newest version (3.10.6-1~22.04).
python3-dev set to manually installed.
The following additional packages will be installed:
  libblkid-dev libblkid1 libcairo-script-interpreter2 libffi-dev libglib2.0-dev libglib2.0-dev-bin
  libice-dev liblzo2-2 libmount-dev libmount1 libpixman-1-dev libselinux1-dev libsepol-dev
  libsm-dev libxcb-render0-dev libxcb-shm0-dev
Suggested packages:
  libcairo2-doc libgirepository1.0-dev

In [None]:
# scapy provides rdpcap/IP, which the notebook uses to read the pcap capture.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install scapy

Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444327 sha256=5c37b07f035d1487a4e0343c7b9ad8077072f88dd5e71b71ea11cd5817f6d608
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0


In [None]:
from scapy.all import rdpcap, IP
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from collections import defaultdict
from graph_tool.all import Graph

class DataBootstrap:
    """Turn a pcap capture into a packet table and a directed flow graph.

    The pipeline implemented by ``run`` is:
    pcap -> packet DataFrame (also written to parquet) -> per-packet flow
    tuples -> flows aggregated per host pair -> graph-tool ``Graph``.

    Args:
        pcap_file: Path of the capture file to read.
        parquet_file: Path the packet table is written to. Defaults to
            ``"dummy.parquet"``, the name the notebook has always used.
    """

    def __init__(self, pcap_file, parquet_file="dummy.parquet"):
        self.pcap_file = pcap_file
        self.parquet_file = parquet_file

    def pcap_to_parquet(self, pcap_file, parquet_file="dummy.parquet"):
        """Read the IP packets of ``pcap_file`` into a DataFrame and save it as parquet.

        Packets without an IP layer are skipped.

        Args:
            pcap_file: Path of the capture to read with scapy's ``rdpcap``.
            parquet_file: Destination path of the parquet copy.

        Returns:
            Tuple ``(df, parquet_file)`` where ``df`` has the columns
            ``time, source, destination, protocol, length, info``.
        """
        rows = [
            [pkt.time, pkt[IP].src, pkt[IP].dst, pkt[IP].proto, len(pkt), pkt.summary()]
            for pkt in rdpcap(pcap_file)
            if IP in pkt
        ]
        df = pd.DataFrame(
            rows,
            columns=['time', 'source', 'destination', 'protocol', 'length', 'info'],
        )
        pq.write_table(pa.Table.from_pandas(df), parquet_file)
        return df, parquet_file

    def extract_flow_data(self, pq_file):
        """Load the packet table from parquet and emit one flow tuple per packet.

        Every row whose ``info`` text contains ``'IP'`` becomes
        ``(source, destination, 1, 0)``: one packet counted on the sender side.
        Direction is only reconciled later, in ``flow_ingestion``.

        Args:
            pq_file: Path of the parquet file written by ``pcap_to_parquet``.

        Returns:
            List of ``(sip, dip, srcpkts, dstpkts)`` tuples.
        """
        flow_df = pd.read_parquet(pq_file)
        # Iterate the three needed columns directly instead of building a
        # Series per row with iterrows().
        return [
            (sip, dip, 1, 0)
            for sip, dip, info in zip(flow_df['source'], flow_df['destination'], flow_df['info'])
            if 'IP' in info
        ]

    def flow_ingestion(self, flow_data):
        """Aggregate per-packet tuples into one record per unordered host pair.

        The first direction seen for a pair becomes its key ``(sip, dip)``.
        Later packets in the opposite direction are folded into the same
        record with their counters swapped.

        Args:
            flow_data: Iterable of ``(sip, dip, srcpkts, dstpkts)`` tuples.

        Returns:
            Mapping ``(sip, dip) -> {'srcpkts': int, 'dstpkts': int}``.
        """
        aggregated_flows = defaultdict(lambda: {'srcpkts': 0, 'dstpkts': 0})

        for sip, dip, srcpkts, dstpkts in flow_data:
            key = (sip, dip)
            reverse = (dip, sip)
            if key not in aggregated_flows and reverse in aggregated_flows:
                # Reply direction of a known flow: swap the counters.
                entry = aggregated_flows[reverse]
                entry['srcpkts'] += dstpkts
                entry['dstpkts'] += srcpkts
            else:
                # Known forward flow, or a new one (the defaultdict creates a
                # zeroed record on first access).
                entry = aggregated_flows[key]
                entry['srcpkts'] += srcpkts
                entry['dstpkts'] += dstpkts
        return aggregated_flows

    def build_graph(self, aggregated_flows):
        """Build a directed graph with one vertex per host and one edge per direction.

        For a flow ``(v1, v2)`` an edge ``v1 -> v2`` carrying ``srcpkts`` is
        added when ``srcpkts > 0``, and an edge ``v2 -> v1`` carrying
        ``dstpkts`` is added when ``dstpkts > 0``. Both edge property maps are
        registered on the graph as ``"srcpkts"`` and ``"dstpkts"``. The host
        address of every vertex is registered as the vertex property
        ``"ip"``, so vertex indices can be mapped back to hosts.

        Args:
            aggregated_flows: Mapping produced by ``flow_ingestion``.

        Returns:
            The populated ``Graph``.
        """
        g = Graph(directed=True)

        srcpkts = g.new_edge_property("float")
        dstpkts = g.new_edge_property("float")
        address = g.new_vertex_property("string")

        vertex_map = {}

        def vertex_for(host):
            # Create the vertex the first time a host is seen and label it.
            if host not in vertex_map:
                vertex_map[host] = g.add_vertex()
                address[vertex_map[host]] = str(host)
            return vertex_map[host]

        for (v1, v2), data in aggregated_flows.items():
            first = vertex_for(v1)
            second = vertex_for(v2)

            forward_pkts = data['srcpkts']
            backward_pkts = data['dstpkts']

            if forward_pkts > 0:
                srcpkts[g.add_edge(first, second)] = forward_pkts
            if backward_pkts > 0:
                dstpkts[g.add_edge(second, first)] = backward_pkts

        g.edge_properties["srcpkts"] = srcpkts
        g.edge_properties["dstpkts"] = dstpkts
        g.vertex_properties["ip"] = address

        return g

    def run(self):
        """Execute the whole pipeline.

        Intermediate results are kept on the instance (``df``,
        ``parquet_file``, ``flow_data``, ``aggregated_flow_data``, ``graph``).

        Returns:
            Tuple ``(df, graph)``.
        """
        self.df, self.parquet_file = self.pcap_to_parquet(self.pcap_file, self.parquet_file)
        self.flow_data = self.extract_flow_data(self.parquet_file)
        self.aggregated_flow_data = self.flow_ingestion(self.flow_data)
        self.graph = self.build_graph(self.aggregated_flow_data)
        return self.df, self.graph


# Capture to analyse; the path lives under /content (presumably the Colab working directory).
pcap_file = "/content/1.pcap"
# Run the full pipeline: `df` is the per-packet table, `graph` the directed flow graph.
df, graph = DataBootstrap(pcap_file).run()

# Print the packet table followed by the graph object.
print(df, graph)

                   time           source      destination  protocol  length  \
0     1712059319.236070      192.168.1.8     202.88.152.8        17     100   
1     1712059319.255923     202.88.152.8      192.168.1.8        17     516   
2     1712059320.040750      192.168.1.8      224.0.0.251        17      87   
3     1712059334.316591      192.168.1.7  239.255.255.250        17     698   
4     1712059334.434051      192.168.1.7  239.255.255.250        17     698   
...                 ...              ...              ...       ...     ...   
5722  1712066693.956085  142.250.196.174      192.168.1.8        17    1399   
5723  1712066693.956085  142.250.196.174      192.168.1.8        17    1399   
5724  1712066693.956085  142.250.196.174      192.168.1.8        17    1399   
5725  1712066693.956085  142.250.196.174      192.168.1.8        17    1399   
5726  1712066693.956085  142.250.196.174      192.168.1.8        17     323   

                                                   

In [None]:
from graph_tool import centrality as gt_centrality
from graph_tool.all import *
from graph_tool import topology
from graph_tool.centrality import pagerank as gt_pagerank
from graph_tool import stats as gt_stats
import pandas as pd


class GraphFeatures:
    """Compute per-vertex structural features of a flow graph as a DataFrame.

    The graph is expected to carry the edge properties ``"srcpkts"`` and
    ``"dstpkts"``.

    Args:
        graph: The graph-tool ``Graph`` to analyse.
    """

    # Feature columns, in the order they appear in the output table.
    FEATURE_COLUMNS = (
        'ID', 'OD', 'IDW', 'ODW', 'IDC', 'ODC', 'BC', 'CC',
        'KC', 'PR', 'Hub', 'Authority', 'LCC',
    )

    def __init__(self, graph):
        self.graph = graph

    def calculate_properties(self, graph):
        """Compute every feature for ``graph``.

        Args:
            graph: Graph to analyse; ``None`` falls back to ``self.graph``.

        Returns:
            Dict keyed by ``FEATURE_COLUMNS``. Degree features are arrays
            ordered by vertex index; the remaining features are graph-tool
            vertex property maps.
        """
        g = self.graph if graph is None else graph
        vertices = g.get_vertices()
        srcpkts = g.edge_properties["srcpkts"]
        dstpkts = g.edge_properties["dstpkts"]

        # hits() returns (eigenvalue, authority map, hub map). Index 0 is a
        # scalar, so the hub scores have to be taken from index 2.
        hub_scores = gt_centrality.hits(g, weight=srcpkts)[2]
        authority_scores = gt_centrality.hits(g, weight=dstpkts)[1]

        return {
            'ID': g.get_in_degrees(vertices),
            'OD': g.get_out_degrees(vertices),
            # Weighted degrees are per-vertex sums of the edge weights. The raw
            # edge arrays used before were indexed by edge, not by vertex.
            # NOTE(review): the srcpkts/dstpkts pairing follows the original
            # code; confirm it matches the features the models were trained on.
            'IDW': g.get_in_degrees(vertices, eweight=srcpkts),
            'ODW': g.get_out_degrees(vertices, eweight=dstpkts),
            'IDC': gt_pagerank(g, weight=srcpkts),
            'ODC': gt_pagerank(g, weight=dstpkts),
            # betweenness() returns (vertex map, edge map); keep the vertex map.
            'BC': gt_centrality.betweenness(g)[0],
            'CC': gt_centrality.closeness(g),
            'KC': gt_centrality.katz(g),
            'PR': gt_centrality.pagerank(g),
            'Hub': hub_scores,
            'Authority': authority_scores,
            'LCC': graph_tool.clustering.local_clustering(g),
        }

    @staticmethod
    def _vertex_column(values, n_vertices):
        """Convert one feature into a plain list with one entry per vertex.

        Accepts a graph-tool property map (unwrapped through its ``.a``
        array), any sized sequence/array, or a scalar (repeated for every
        vertex).
        """
        array = getattr(values, 'a', None)
        if array is not None:
            values = array
        if hasattr(values, '__len__'):
            return list(values)
        return [values] * n_vertices

    def write_df(self, properties):
        """Assemble the feature dict into a table with one row per vertex.

        Values are read positionally by vertex index. The previous
        ``v in properties[...]`` guard tested whether the vertex index equalled
        some stored value and therefore zeroed most columns.

        Args:
            properties: Dict keyed by ``FEATURE_COLUMNS``, as returned by
                ``calculate_properties``.

        Returns:
            DataFrame with a ``Vertex`` column followed by ``FEATURE_COLUMNS``.

        Raises:
            ValueError: Raised by pandas when a feature does not have exactly
                one value per vertex.
        """
        n_vertices = self.graph.num_vertices()
        data = {'Vertex': range(n_vertices)}
        for name in self.FEATURE_COLUMNS:
            data[name] = self._vertex_column(properties[name], n_vertices)
        return pd.DataFrame(data)

    def run(self):
        """Compute the features of ``self.graph`` and return them as a DataFrame.

        The intermediate dict is kept as ``self.properties`` and the table as
        ``self.prop_df``.
        """
        self.properties = self.calculate_properties(self.graph)
        self.prop_df = self.write_df(self.properties)
        return self.prop_df


# Compute the per-vertex feature table for the flow graph built in the previous cell.
prop_df = GraphFeatures(graph).run()

# Print the resulting feature table.
print(prop_df)

    Vertex  ID  OD    IDW    ODW  IDC  ODC        BC        CC  KC  PR  \
0        0  38  39   73.0    0.0    0    0  0.613946  0.000000   0   0   
1        1   1   1    0.0    0.0    0    0  0.000000  0.506494   0   0   
2        2   6   0    0.0    0.0    0    0  0.000000  0.000000   0   0   
3        3   0   3    0.0    0.0    0    0  0.000000  0.000000   0   0   
4        4   2   0  329.0    0.0    0    0  0.000000  0.000000   0   0   
5        5   0   1   13.0    0.0    0    0  0.000000  0.000000   0   0   
6        6   1   1    0.0   13.0    0    0  0.000000  0.000000   0   0   
7        7   1   1   71.0    0.0    0    0  0.000000  0.000000   0   0   
8        8   1   1    0.0   65.0    0    0  0.000000  0.000000   0   0   
9        9   1   1   12.0    0.0    0    0  0.000000  0.000000   0   0   
10      10   1   1    0.0   11.0    0    0  0.000000  0.000000   0   0   
11      11   1   1   77.0    0.0    0    0  0.000000  0.000000   0   0   
12      12   0   2    0.0   83.0    0 

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class DataPreprocessing:
    """Impute missing values and standardize a feature matrix.

    Args:
        X: Feature matrix (DataFrame or array-like) to clean.
    """

    def __init__(self, X):
        self.X = X

    def impute_missing_values(self, strategy='mean', X=None):
        """Fill missing values with scikit-learn's ``SimpleImputer``.

        Args:
            strategy: Imputation strategy forwarded to ``SimpleImputer``.
            X: Matrix to impute; ``None`` means ``self.X``.

        Returns:
            The imputed matrix returned by ``fit_transform``.
        """
        data = self.X if X is None else X
        # NOTE(review): per the scikit-learn docs, SimpleImputer drops columns
        # that contain only missing values unless keep_empty_features is set.
        # Confirm the feature count still matches what downstream models expect.
        return SimpleImputer(strategy=strategy).fit_transform(data)

    def standardize_features(self, X=None):
        """Scale features with scikit-learn's ``StandardScaler``.

        Args:
            X: Matrix to scale; ``None`` means ``self.X``.

        Returns:
            The scaled matrix returned by ``fit_transform``.
        """
        data = self.X if X is None else X
        return StandardScaler().fit_transform(data)

    def run(self):
        """Impute, then standardize, and return the cleaned matrix.

        The scaler is fed the imputed matrix. Previously it was fed the raw
        ``self.X``, which silently discarded the imputation step.

        Returns:
            The standardized matrix, also stored as ``self.cleaned_df``. The
            imputed intermediate is stored as ``self.imputed``.
        """
        self.imputed = self.impute_missing_values()
        self.cleaned_df = self.standardize_features(self.imputed)
        return self.cleaned_df



# Clean the feature table from the previous cell (imputation followed by standardization).
cleaned_df = DataPreprocessing(prop_df).run()

# Print the preprocessed feature matrix.
print(cleaned_df)

[[-1.69774938  6.91295232  6.97791963  0.26895298 -0.17391032  0.
   0.          7.         -0.14285714  0.          0.          0.
   7.          0.        ]
 [-1.62845348 -0.14108066 -0.13858456 -0.32564325 -0.17391032  0.
   0.         -0.14285714  7.          0.          0.          0.
  -0.14285714  0.        ]
 [-1.55915759  0.81216704 -0.32586098 -0.32564325 -0.17391032  0.
   0.         -0.14285714 -0.14285714  0.          0.          0.
  -0.14285714  0.        ]
 [-1.4898617  -0.3317302   0.2359683  -0.32564325 -0.17391032  0.
   0.         -0.14285714 -0.14285714  0.          0.          0.
  -0.14285714  0.        ]
 [-1.4205658   0.04956888 -0.32586098  2.35411237 -0.17391032  0.
   0.         -0.14285714 -0.14285714  0.          0.          0.
  -0.14285714  0.        ]
 [-1.35126991 -0.3317302  -0.13858456 -0.21975625 -0.17391032  0.
   0.         -0.14285714 -0.14285714  0.          0.          0.
  -0.14285714  0.        ]
 [-1.28197402 -0.14108066 -0.13858456 -0.32564

In [None]:
import pandas as pd
import joblib

class PredictionModel:
    """Score a feature matrix with three saved models and export the result.

    Args:
        X: Preprocessed feature matrix, one row per record.
        info_df: Table of descriptive columns, row-aligned with ``X``, that
            the predictions are appended to.
    """

    # Output column -> saved model file. Iteration order is the column order.
    MODEL_FILES = {
        'DecisionTree_Prediction': 'decision_tree_model.pkl',
        'HistGradientBoosting_Prediction': 'hist_gradient_boosting_model.pkl',
        'RandomForest_Prediction': 'random_forest_model.pkl',
    }

    def __init__(self, X, info_df):
        self.X = X
        self.info_df = info_df

    def predictions(self):
        """Predict every record with each model listed in ``MODEL_FILES``.

        ``self.X`` is replaced by its DataFrame form as a side effect.

        Returns:
            DataFrame with one prediction column per model.
        """
        self.X = pd.DataFrame(self.X)
        features = self.X.values

        columns = {}
        for column, model_path in self.MODEL_FILES.items():
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code, so only load model files from a trusted source.
            model = joblib.load(model_path)
            # One batch call per model replaces a Python-level call per row.
            # An empty matrix is short-circuited so no model sees zero samples.
            columns[column] = model.predict(features) if len(features) else []

        return pd.DataFrame(columns)

    def merge_csv(self):
        """Place the prediction columns next to ``info_df``.

        Both frames are joined by row position. Their indexes are reset first
        because ``concat(axis=1)`` aligns on index labels and would otherwise
        produce NaN-padded rows for a non-default ``info_df`` index.

        Requires ``self.predictions_df`` (set by ``run``).

        Returns:
            The merged DataFrame, also stored as ``self.merged_df``.
        """
        self.merged_df = pd.concat(
            [
                self.info_df.reset_index(drop=True),
                self.predictions_df.reset_index(drop=True),
            ],
            axis=1,
        )
        return self.merged_df

    def run(self):
        """Predict, merge, and write the merged table to ``merged_data.csv``.

        Returns:
            The name of the CSV file that was written.
        """
        self.predictions_df = self.predictions()
        self.merged_df = self.merge_csv()
        # Kept for code that read the originally (mis)named attribute.
        self.merged_pd = self.merged_df
        self.csv_filename = 'merged_data.csv'
        self.merged_df.to_csv(self.csv_filename, index=False)
        return self.csv_filename


# Score the preprocessed features, append the predictions to the feature table,
# and keep the name of the CSV file that run() writes.
# NOTE(review): the variable name `csv` would shadow the stdlib `csv` module if it were imported.
csv = PredictionModel(cleaned_df, prop_df).run()

In [None]:
# Show the name of the CSV file produced by the prediction step.
print(csv)

merged_data.csv
