In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/netflow/CIDDS-001/traffic/mixed.csv', index_col='DATE')
df.drop(['FLAGS', 'CLASS', 'ATTACK_ID', 'ATTACK_DESCRIPTION'], axis=1, inplace=True)

df = df.sample(frac=1).iloc[:1000]

df['SRC'] = df['SRC_IP_ADDR'].str.cat(df['SRC_PORT'].astype(str), sep=':')
df['DST'] = df['DST_IP_ADDR'].str.cat(df['DST_PORT'].astype(str), sep=':')
df.drop(['SRC_IP_ADDR', 'SRC_PORT', 'DST_IP_ADDR', 'DST_PORT'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,DURATION,PROTOCOL,PACKETS,BYTES,TOS,ATTACK_TYPE,TYPE,SRC,DST
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-03-27 08:50:48.506,0.425,TCP,14,5390.0,32,benign,internal,10433_120:443,192.168.210.5:51340
2017-03-27 12:17:12.579,0.0,TCP,1,66.0,32,benign,internal,14469_126:443,192.168.220.15:45510
2017-03-22 10:43:39.938,0.0,UDP,1,161.0,0,benign,internal,DNS:53,192.168.210.4:60918
2017-03-24 06:10:27.875,0.213,TCP,5,865.0,0,benign,internal,192.168.220.4:45889,10371_48:80
2017-03-19 02:55:40.569,0.042,TCP,6,515.0,0,benign,internal,OPENSTACK_NET:59648,EXT_SERVER:8000


In [3]:
from sklearn.preprocessing import LabelEncoder

def label_encode(series_to_encode):
    lb_e = LabelEncoder()
    return lb_e, lb_e.fit_transform(series_to_encode)

FEATURES_TO_LABELENCODE = ['PROTOCOL', 'TYPE', 'TOS', 'ATTACK_TYPE', 'SRC', 'DST']
encoders = {}

for f in FEATURES_TO_LABELENCODE:
    print("Processing feature {}".format(f))
    encoders[f], df[f] = label_encode(df[f])

Processing feature PROTOCOL
Processing feature TYPE
Processing feature TOS
Processing feature ATTACK_TYPE
Processing feature SRC
Processing feature DST


In [4]:
nodes_df = pd.concat([
    df[['SRC', 'TYPE']].rename(columns={'SRC': 'ID'}).reset_index(drop=True), 
    df[['DST', 'TYPE']].rename(columns={'DST': 'ID'}).reset_index(drop=True)
]).drop_duplicates()

nodes_df['COLOR'] = np.where(nodes_df['TYPE'] == 0, "#107be6", "#000000")
nodes_df.drop('TYPE', axis=1, inplace=True)

nodes_df.head()

Unnamed: 0,ID,COLOR
0,133,#000000
1,232,#000000
2,721,#000000
3,613,#000000
4,735,#000000


In [5]:
edges_df = df[['SRC', 'DST', 'DURATION', 'PROTOCOL', 'PACKETS', 'BYTES', 'TOS', 'ATTACK_TYPE']]
edges_df['SPEED'] = edges_df['BYTES'] / edges_df['DURATION']
edges_df['SPEED'] = edges_df['SPEED'].replace([np.inf, -np.inf], 0.01)

edges_df.reset_index(inplace=True, drop=True)

edges_df.head()

Unnamed: 0,SRC,DST,DURATION,PROTOCOL,PACKETS,BYTES,TOS,ATTACK_TYPE,SPEED
0,133,315,0.425,1,14,5390.0,1,0,12682.352941
1,232,493,0.0,1,1,66.0,1,0,0.01
2,721,308,0.0,2,1,161.0,0,0,0.01
3,613,132,0.213,1,5,865.0,0,0,4061.032864
4,735,722,0.042,1,6,515.0,0,0,12261.904762


In [6]:
import os, sys
sys.path.append(os.path.join(os.path.dirname('notebooks'), '..'))

from graphs.multigraph import Multigraph
from embeddings.b_word2vec import Word2VecBaseline

In [7]:
multigraph = Multigraph()

multigraph.add_nodes(nodes_df)
multigraph.add_edges(edges_df)

# multigraph.plot()

In [10]:
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from gensim.models import Word2Vec

In [15]:
!pip install gensim==4.0.0



In [16]:
networkx_graph = multigraph._G

rw = BiasedRandomWalk(StellarGraph.from_networkx(networkx_graph))

walks = (
    [
        list(map(str, walk))
        for walk in rw.run(
            nodes=list(networkx_graph.nodes()),  # root nodes
            length=100,  # maximum length of a random walk
            n=10,  # number of random walks per root node
            p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
            q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
        )
    ],
)

In [21]:
Word2Vec(walks, size=4, window=5, min_count=0, sg=1, workers=2, iter=1)

TypeError: __init__() got an unexpected keyword argument 'size'

In [8]:
embeddings = Word2VecBaseline.from_networkx_graph(multigraph._G)

TypeError: __init__() got an unexpected keyword argument 'size'

In [None]:
#embeddings.fit()

In [None]:
#embeddings.node_embeddings

In [None]:
#embeddings.node_embeddings_id