Import datasets

In [None]:
import pandas as pd
import numpy as np

EDGE_COLS = [
    'Bwd Packet Length Min', 'Protocol_6', 'Bwd Packets/s', 'FWD Init Win Bytes',
    'Packet Length Std', 'FIN Flag Count', 'SrcPortRange_registered',
    'Packet Length Min', 'Fwd Seg Size Min', 'DstPortRange_well_known',
    'Bwd IAT Total', 'SYN Flag Count', 'Bwd Packet Length Std'
]
LABEL_COL = "target"
ID_COLS = ['Src IP', 'Dst IP', 'Timestamp']

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

X_train = df_train.drop(columns=[LABEL_COL, "Src IP", "Dst IP", "Timestamp"])
y_train = df_train[LABEL_COL]
X_test  = df_test.drop(columns=[LABEL_COL, "Src IP", "Dst IP", "Timestamp"])
y_test  = df_test[LABEL_COL]

print(df_train.shape)
print(df_train.columns)
print(df_train.loc[1])

(25901651, 17)
Index(['Timestamp', 'Src IP', 'Dst IP', 'Bwd Packet Length Min', 'Protocol_6',
       'Bwd Packets/s', 'FWD Init Win Bytes', 'Packet Length Std',
       'FIN Flag Count', 'SrcPortRange_registered', 'Packet Length Min',
       'Fwd Seg Size Min', 'DstPortRange_well_known', 'Bwd IAT Total',
       'SYN Flag Count', 'Bwd Packet Length Std', 'target'],
      dtype='object')
Timestamp                  2018-02-16 12:38:45.787171
Src IP                                   172.31.66.26
Dst IP                                  23.219.88.169
Bwd Packet Length Min                               0
Protocol_6                                       True
Bwd Packets/s                                0.133515
FWD Init Win Bytes                               8192
Packet Length Std                          113.214348
FIN Flag Count                                      0
SrcPortRange_registered                         False
Packet Length Min                                   0
Fwd Seg Size Min  

Preprocessing & normalization

In [3]:
from sklearn.preprocessing import StandardScaler

# Concatenate train and test for consistent scaling
scaler = StandardScaler()
all_features = pd.concat([X_train, X_test], axis=0)

all_features_scaled = scaler.fit_transform(all_features)
X_train_scaled = all_features_scaled[:len(X_train)]
X_test_scaled  = all_features_scaled[len(X_train):]

In [6]:
df = pd.DataFrame(X_train_scaled)
print(df.shape)
print(df.columns)
print(df.loc[1])

(25901651, 13)
RangeIndex(start=0, stop=13, step=1)
0    -0.694499
1     0.803116
2    -0.096192
3    -0.005420
4    -0.279327
5    -0.684802
6    -0.339941
7    -0.694816
8     0.477040
9     0.465803
10    3.017777
11    0.809437
12   -0.273051
Name: 1, dtype: float64


Assign node indices (each flow = 1 node)

In [8]:
# Node indices: train first, then test
train_nodes = np.arange(len(X_train_scaled))
test_nodes  = np.arange(len(X_train_scaled), len(X_train_scaled) + len(X_test_scaled))

# Combine
X_all = np.vstack([X_train_scaled, X_test_scaled])
y_all = np.concatenate([y_train.values, y_test.values])
df_all = pd.concat([df_train[['Src IP', 'Dst IP']], df_test[['Src IP', 'Dst IP']]], axis=0).reset_index(drop=True)

In [None]:
# pd.DataFrame(y_all).loc[1]
# df_all.loc[1]

0    0
Name: 22290, dtype: int64

Build edges between flows that share an IP

In [19]:
from itertools import combinations

edges = set()
for col in ['Src IP', 'Dst IP']:
    ip_groups = df_all.groupby(col).groups  # dict: ip -> list of flow indices
    print(ip_groups)
    for ip, flow_idxs in ip_groups.items():
        # Connect all pairs of flows that share this IP
        for i, j in combinations(flow_idxs, 2):
            edges.add((i, j))
            edges.add((j, i))  # make undirected

# Convert to tensor
import torch
edge_index = torch.tensor(list(zip(*edges)), dtype=torch.long)
print(edge_index.shape)

{'1.0.0.3': [31139768], '1.0.128.64': [20283897, 20287246, 20292807], '1.0.132.203': [18957869], '1.0.161.218': [10170406, 10172183, 11944676, 11948968], '1.0.176.98': [13468303, 13744125, 13746679, 13749336, 13834945, 13965395, 13971586, 13980057, 14177945, 14372451, 14373253, 14457492, 14573664, 14574433, 14606124, 14817805, 14860694, 14866719, 15066277, 15079410, 15506571, 15608860, 15609123, 15625949, 15627816, 15697471, 15701436, 15702701, 15793825, 15800737, 15895062, 15913865, 15914881, 15966404, 16108685, 16308292, 16313374, 16466861, 16574168, 16765518, 17189417, 17260568, 17315283, 17317910, 17319986, 17343125, 17674099, 17757210, 17759053, 17763372, 17917987, 17919908, 17944661, 18052908, 18057400, 18060863, 18164741, 18168647, 18498857, 18504949, 18616572, 18909566, 18929717, 18984211, 19024740, 19086115, 19098699, 19100956, 19107396], '1.0.183.33': [25804786, 30101621, 30802974], '1.0.199.30': [8305231, 8310330], '1.1.129.0': [30549434, 30553489, 30553530, 30960865, 309610

KeyboardInterrupt: 