In [None]:
# Data-frame / numeric libraries plus stdlib helpers for the cells below.
import pandas as pd
import polars as pl
import numpy as np
import heapq
import sys
# Put the sibling framework directory on the module search path; this has to
# run before the racetrack import on the next line.
sys.path.insert(1, '../framework')
# Star import: presumably where the RACETrack name used below comes from -- confirm.
from racetrack import *
# Single shared instance; later cells call rt.chordDiagram / rt.tile / rt.xy on it.
rt = RACETrack()

In [None]:
# Tiny directed edge list (fm -> to) used to compare chord diagram options.
edges = [('a','b'), ('a','d'), ('b','a'), ('b','c'),
         ('b','d'), ('c','d'), ('e','d'), ('f','d')]
df = pd.DataFrame(edges, columns=['fm','to'])

# Keyword arguments shared by every diagram in the comparison grid.
params = dict(df=df, relationships=[('fm','to')], draw_labels=True,
              txt_h=20, w=256, h=256, x_ins=10, y_ins=10)

# One entry per tile: default, wide links, equal-size nodes, and both together.
variants = [{},
            {'link_style':'wide'},
            {'equal_size_nodes':True},
            {'link_style':'wide', 'equal_size_nodes':True}]

# Last expression of the cell, so the tiled result is what gets displayed.
rt.tile([rt.chordDiagram(**variant, **params) for variant in variants])

In [None]:
# 2024-03-08 - doesn't work: module 'networkx' has no attribute 'to_scipy_sparse_matrix'
# (networkx 3.0 removed to_scipy_sparse_matrix; to_scipy_sparse_array is its replacement).
#
# Following from the documentation located here:
#
# https://markov-clustering.readthedocs.io/en/latest/readme.html
#

#import markov_clustering as mc
#import networkx as nx
#import random
## number of nodes to use
#numnodes = 200
## generate random positions as a dictionary where the key is the node id and the value
## is a tuple containing 2D coordinates
#positions = {i:(random.random() * 2 - 1, random.random() * 2 - 1) for i in range(numnodes)}
## use networkx to generate the graph
#network = nx.random_geometric_graph(numnodes, 0.3, pos=positions)
## then get the adjacency matrix (in sparse form)
#matrix = nx.to_scipy_sparse_matrix(network)

In [None]:
# Build the data frame `df` and the chord diagram `cd` used by the cells below.
# With load_lots_of_netflow set, the three netflow CSV chunks are read, their
# columns renamed to short names, and a fixed-size random sample is kept;
# otherwise a one-row placeholder frame is used so the notebook still runs.
load_lots_of_netflow = True
if load_lots_of_netflow:
    _base_ = '../../data/2013_vast_challenge/mc3_netflow/nf/'
    df = pl.concat([pl.read_csv(_base_ + f'nf-chunk{i}.csv') for i in (1, 2, 3)])
    df = df.rename({'TimeSeconds':'secs',                  'parsedDate':'timestamp',                'dateTimeStr':'timestamp_str',
                    'ipLayerProtocol':'pro_str',           'ipLayerProtocolCode':'pro',             'firstSeenSrcIp':'sip',
                    'firstSeenDestIp':'dip',               'firstSeenSrcPort':'spt',                'firstSeenDestPort':'dpt',
                    'moreFragments':'mfrag',               'contFragments':'cfrag',                 'durationSeconds':'dur',
                    'firstSeenSrcPayloadBytes':'soct_pay', 'firstSeenDestPayloadBytes':'doct_pay',  'firstSeenSrcTotalBytes':'soct',
                    'firstSeenDestTotalBytes':'doct',      'firstSeenSrcPacketCount':'spkt',        'firstSeenDestPacketCount':'dpkt',
                    'recordForceOut':'out'})
    # Sampling without replacement raises when n exceeds the number of rows, so
    # cap n at the frame height. The fixed seed makes the sample (and every
    # downstream result) identical across Restart & Run All.
    df = df.sample(n=min(1_000_000, df.height), seed=0)
else:
    df = pl.DataFrame({'sip':['1.2.3.4'], 'dip':['5.6.7.8']})
# Source-IP -> destination-IP chord diagram; kept in `cd` because later cells
# read its attributes. The bare `cd` makes it the cell's displayed output.
cd = rt.chordDiagram(df, [('sip','dip')], equal_size_nodes=True, draw_labels=False, txt_h=16, w=200, h=200, x_ins=2, y_ins=2)
cd

In [None]:
import hdbscan

def _arc_midpoint(span):
    """Return (span[0] + span[1]) / 720 - 0.5 for a two-element arc span.

    That is the span's midpoint divided by 360 and shifted by -0.5
    (presumably the spans are angles in degrees -- TODO confirm).
    """
    return (span[0] + span[1]) / 720.0 - 0.5

# Turn every directed edge of the chord diagram into one 2-D point
# (from-arc midpoint, to-arc midpoint), cluster those points with HDBSCAN,
# and scatter-plot them colored by cluster label.
#
# NOTE(review): assumes cd.node_dir_arc[node][fm][to] is the arc occupied on
# `node` by the edge fm -> to, with `node` being one of the two endpoints --
# verify against the chord diagram implementation.
handled   = set()   # 'fm|to' keys already emitted (each edge is listed under both endpoints)
fmto_list = []      # edge key for each emitted point
span_list = []      # (fm_coord, to_coord) pairs fed to the clusterer
xs_list   = []      # fm_coord per edge (plot x)
ys_list   = []      # to_coord per edge (plot y)
for node in cd.node_dir_arc:
    for fm in cd.node_dir_arc[node]:
        for to in cd.node_dir_arc[node][fm]:
            key = str(fm) + '|' + str(to)
            if key in handled:
                continue
            handled.add(key)
            span_here = cd.node_dir_arc[node][fm][to]
            if fm == node:
                # Visiting the source endpoint: the local arc is the "from" arc.
                fm_span = span_here
                to_span = cd.node_dir_arc[to][fm][to]
            else:
                # Visiting the destination endpoint: the local arc is the "to"
                # arc. Assigning it to the "from" role would swap x and y for
                # this edge depending on which endpoint is iterated first.
                fm_span = cd.node_dir_arc[fm][fm][to]
                to_span = span_here
            fm_coord = _arc_midpoint(fm_span)
            to_coord = _arc_midpoint(to_span)
            fmto_list.append(key)
            span_list.append((fm_coord, to_coord))
            xs_list.append(fm_coord)
            ys_list.append(to_coord)

clusterer = hdbscan.HDBSCAN()
clusterer.fit(span_list)
# HDBSCAN marks noise points with label -1; that is not a cluster, so leave it
# out of the count.
n_clusters = len(set(clusterer.labels_) - {-1})
print("n_clusters =", n_clusters)
rt.xy(pd.DataFrame({'x':xs_list,'y':ys_list,'c':clusterer.labels_}),x_field='x',y_field='y',color_by='c')

In [None]:
# Display the chord diagram's `time_lu` attribute as the cell output
# (presumably a lookup of timing measurements recorded while rendering -- confirm).
cd.time_lu