In [None]:
import pandas as pd
import polars as pl
import numpy  as np
import networkx as nx
from math import sin, cos, pi, sqrt, atan2
from os.path import exists
import time
from rtsvg import *
rt = RACETrack()

# Location of the VAST 2013 MC3 netflow chunks; all chunks are concatenated into one frame.
NETFLOW_DIR    = '../../data/2013_vast_challenge/mc3_netflow/nf'
NETFLOW_CHUNKS = ('nf-chunk1.csv', 'nf-chunk2.csv', 'nf-chunk3.csv')

# Time the load (CSV read + timestamp handling of 'parsedDate').
ts1 = time.time()
df = pl.concat([pl.read_csv(f'{NETFLOW_DIR}/{chunk}') for chunk in NETFLOW_CHUNKS])
# NOTE(review): presumably parses 'parsedDate' into a timestamp dtype -- confirm against rtsvg.
df = rt.columnsAreTimestamps(df, 'parsedDate')
ts2 = time.time()
# '0.2f' => two decimal places.  (A bare '0.2' means two *significant digits* and
# switches to exponent notation for longer load times.)
print(f'Loading Time ... {ts2 - ts1:0.2f} sec')

# Columns removed from the frame.  Commented-out entries are the columns that are
# intentionally kept (they are renamed to short names below).
df = df.drop(['TimeSeconds',
              #'parsedDate',
              'dateTimeStr',
              #'ipLayerProtocol',
              'ipLayerProtocolCode',
              #'firstSeenSrcIp',
              #'firstSeenDestIp',
              #'firstSeenSrcPort',
              #'firstSeenDestPort',
              'moreFragments',
              'contFragments',
              #'durationSeconds',
              'firstSeenSrcPayloadBytes',
              'firstSeenDestPayloadBytes',
              #'firstSeenSrcTotalBytes',
              #'firstSeenDestTotalBytes',
              #'firstSeenSrcPacketCount',
              #'firstSeenDestPacketCount',
              'recordForceOut'])

# Short column names used by every later cell.
df = df.rename({'parsedDate':               'ts',
                'ipLayerProtocol':          'pro',
                'firstSeenSrcIp':           'sip',
                'firstSeenDestIp':          'dip',
                'firstSeenSrcPort':         'spt',
                'firstSeenDestPort':        'dpt',
                'durationSeconds':          'dur',
                'firstSeenSrcTotalBytes':   'soct',
                'firstSeenDestTotalBytes':  'doct',
                'firstSeenSrcPacketCount':  'spkt',
                'firstSeenDestPacketCount': 'dpkt'})

# Show a few random rows as the cell output.
df.sample(3)

In [None]:
# Saved node layout (written by the commented-out saveLayout call later in the notebook).
layout_file = '../../data/2013_vast_challenge/mc3_netflow/spring_layout.csv'
# Fixed seed so the fallback spring layout is identical on every Restart & Run All.
LAYOUT_SEED = 42

# Graph edges run from source IP to destination IP.
relates = [('sip','dip')]
g       = rt.createNetworkXGraph(df, relates)

if exists(layout_file):
    # Start empty; the saved layout is loaded from layout_file in a later cell.
    # NOTE(review): `pos` is indexed per node further down -- confirm loadLayout fills this dict.
    pos = {}
else:
    # No saved layout yet: compute a reproducible force-directed layout.
    pos = nx.spring_layout(g, seed=LAYOUT_SEED)
#rt.link(df, relates, pos)

In [None]:
# Reduce the flows to the distinct (sip, dip) pairs -- only the edge set matters for layout.
df_uniqs    = df.unique(subset=['sip','dip'])
# Distinct endpoints appearing as either source or destination.
total_nodes = len(set(df_uniqs['sip']).union(df_uniqs['dip']))
print(f'{len(df_uniqs)=} | {total_nodes=}')

# Interactive layout widget over the de-duplicated edges.
_igl_ = rt.interactiveGraphLayout(
    df_uniqs,
    {
        'relationships':  relates,
        'pos':            pos,
        'draw_labels':    False,
        'bounds_percent': 0.02,
    },
    w=1200,
    h=800,
)
# Restore a previously saved layout when one is available.
if exists(layout_file):
    _igl_.loadLayout(layout_file)
#_igl_

In [None]:
#_igl_.saveLayout(layout_file)
#print(_igl_)

In [None]:
# Collapse the original graph into one based on position of nodes in xy coordinates
g      = rt.createNetworkXGraph(df_uniqs, relates)
_link_ = rt.link(df_uniqs, relates, pos, w=1000, h=600)
_link_.renderSVG() # force a render so that xT and yT exist

all_nodes    = set(df_uniqs['sip']) | set(df_uniqs['dip'])
all_nodes_ls = list(all_nodes)

# Bounding box of the transformed coordinates.  Start from +/-infinity so the
# first node always initialises the bounds, whatever the coordinate magnitudes are.
x_min = y_min = float('inf')
x_max = y_max = float('-inf')

node_to_xy = {}   # node -> (x, y) after the xT / yT transforms
xy_to_node = {}   # (x, y) -> list of nodes that land on exactly that coordinate
for node in all_nodes:
    # NOTE(review): requires `pos` to hold an entry for every node -- confirm for the
    # case where the layout was loaded from file rather than computed.
    wx, wy = pos[node][0], pos[node][1]
    xy     = (_link_.xT(wx), _link_.yT(wy))
    x_min, x_max = min(x_min, xy[0]), max(x_max, xy[0])
    y_min, y_max = min(y_min, xy[1]), max(y_max, xy[1])
    node_to_xy[node] = xy
    xy_to_node.setdefault(xy, []).append(node)
#_link_

In [None]:
import hdbscan

# Width / height of the node bounding box.  Fall back to 1.0 when every node shares
# the same x (or y) so the normalisation below cannot divide by zero; in that case
# the normalised coordinate is simply 0 for every node.
x_span = (x_max - x_min) or 1.0
y_span = (y_max - y_min) or 1.0

vecs  = []                          # per-node [x, y] scaled into [0, 1] for clustering
xy_lu = {'x':[], 'y':[], 'c':[]}    # un-normalised coordinates plus cluster label
for node in all_nodes_ls:
    x, y = node_to_xy[node]
    vecs.append([(x - x_min) / x_span, (y - y_min) / y_span])
    xy_lu['x'].append(x)
    xy_lu['y'].append(y)

# Cluster the normalised positions with HDBSCAN's default parameters.
clusterer = hdbscan.HDBSCAN()
clusterer.fit(vecs)
# labels_ is in the same order as vecs, i.e. the order of all_nodes_ls.
xy_lu['c'].extend(clusterer.labels_)
#rt.xy(pl.DataFrame(xy_lu), x_field='x', y_field='y', color_by='c', w=800, h=600)

In [None]:
# Range of degree thresholds to evaluate (half-open: MIN <= threshold < MAX).
DEGREE_THRESHOLD_MIN = 500
DEGREE_THRESHOLD_MAX = 1000

# Look up each node's degree once instead of once per threshold.
degrees    = [g.degree(node) for node in all_nodes_ls]
degrees_lu = {'degree': degrees}
#rt.histogram(pl.DataFrame(degrees_lu), bin_by='degree')

# For every threshold t, count the nodes whose degree is >= t.  With the degrees
# sorted ascending, searchsorted(..., side='left') gives the index of the first
# degree >= t, so everything from that index to the end qualifies.
degrees_sorted = np.sort(np.asarray(degrees))
thresholds     = np.arange(DEGREE_THRESHOLD_MIN, DEGREE_THRESHOLD_MAX)
counts         = len(degrees) - np.searchsorted(degrees_sorted, thresholds, side='left')

degree_histogram_lu = {'threshold': thresholds.tolist(), 'count': counts.tolist()}
rt.xy(pl.DataFrame(degree_histogram_lu), x_field='threshold', y_field='count')