In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/netflow/CIDDS-001/traffic/mixed.csv', index_col='DATE')
df.drop(['FLAGS', 'CLASS', 'ATTACK_ID', 'ATTACK_DESCRIPTION'], axis=1, inplace=True)

df = df.sample(frac=1).iloc[:1000]

df['SRC'] = df['SRC_IP_ADDR'].str.cat(df['SRC_PORT'].astype(str), sep=':')
df['DST'] = df['DST_IP_ADDR'].str.cat(df['DST_PORT'].astype(str), sep=':')
df.drop(['SRC_IP_ADDR', 'SRC_PORT', 'DST_IP_ADDR', 'DST_PORT'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,DURATION,PROTOCOL,PACKETS,BYTES,TOS,ATTACK_TYPE,TYPE,SRC,DST
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-03-26 13:55:23.696,0.0,UDP,1,262.0,0,benign,internal,192.168.200.5:138,192.168.200.255:138
2017-03-21 20:45:02.813,0.001,TCP,1,46.0,0,benign,external,10887_123:3551,EXT_SERVER:3306
2017-03-23 11:47:04.345,0.003,TCP,5,479.0,0,dos,internal,192.168.220.15:48261,192.168.100.6:80
2017-03-24 13:38:18.522,0.002,UDP,2,148.0,0,benign,internal,192.168.220.13:39725,DNS:53
2017-03-24 18:43:23.755,3.0,TCP,1,46.0,0,benign,external,22667_11:42424,EXT_SERVER:80


In [3]:
from sklearn.preprocessing import LabelEncoder

def label_encode(series_to_encode):
    lb_e = LabelEncoder()
    return lb_e, lb_e.fit_transform(series_to_encode)

FEATURES_TO_LABELENCODE = ['PROTOCOL', 'TYPE', 'TOS', 'ATTACK_TYPE', 'SRC', 'DST']
encoders = {}

for f in FEATURES_TO_LABELENCODE:
    print("Processing feature {}".format(f))
    encoders[f], df[f] = label_encode(df[f])

Processing feature PROTOCOL
Processing feature TYPE
Processing feature TOS
Processing feature ATTACK_TYPE
Processing feature SRC
Processing feature DST


In [4]:
nodes_df = pd.concat([
    df[['SRC', 'TYPE']].rename(columns={'SRC': 'ID'}).reset_index(drop=True), 
    df[['DST', 'TYPE']].rename(columns={'DST': 'ID'}).reset_index(drop=True)
]).drop_duplicates()

nodes_df['COLOR'] = np.where(nodes_df['TYPE'] == 0, "#107be6", "#000000")
nodes_df.drop('TYPE', axis=1, inplace=True)

nodes_df.head()

Unnamed: 0,ID,COLOR
0,255,#000000
1,181,#107be6
2,510,#000000
3,378,#000000
4,707,#107be6


In [5]:
edges_df = df[['SRC', 'DST', 'DURATION', 'PROTOCOL', 'PACKETS', 'BYTES', 'TOS', 'ATTACK_TYPE']]
edges_df['SPEED'] = edges_df['BYTES'] / edges_df['DURATION']
edges_df['SPEED'] = edges_df['SPEED'].replace([np.inf, -np.inf], 0.01)

edges_df.reset_index(inplace=True, drop=True)

edges_df.head()

Unnamed: 0,SRC,DST,DURATION,PROTOCOL,PACKETS,BYTES,TOS,ATTACK_TYPE,SPEED
0,255,253,0.0,2,1,262.0,0,0,0.01
1,181,717,0.001,1,1,46.0,0,0,46000.0
2,510,251,0.003,1,5,479.0,0,1,159666.666667
3,378,709,0.002,2,2,148.0,0,0,74000.0
4,707,724,3.0,1,1,46.0,0,0,15.333333


In [8]:
import os, sys
sys.path.append(os.path.join(os.path.dirname('notebooks'), '..'))

from graphs.multigraph import Multigraph
from embeddings.b_word2vec import Word2VecBaseline

In [10]:
multigraph = Multigraph()

multigraph.add_nodes(nodes_df)
multigraph.add_edges(edges_df)

# multigraph.plot()

In [11]:
embeddings = Word2VecBaseline.from_networkx_graph(multigraph._G)

TypeError: unhashable type: 'list'

In [None]:
#embeddings.fit()

In [None]:
#embeddings.node_embeddings

In [None]:
#embeddings.node_embeddings_id