Import datasets

In [1]:
import pandas as pd
import numpy as np

EDGE_COLS = [
    'Bwd Packet Length Min', 'Protocol_6', 'Bwd Packets/s', 'FWD Init Win Bytes',
    'Packet Length Std', 'FIN Flag Count', 'SrcPortRange_registered',
    'Packet Length Min', 'Fwd Seg Size Min', 'DstPortRange_well_known',
    'Bwd IAT Total', 'SYN Flag Count', 'Bwd Packet Length Std'
]
LABEL_COL = "target"
ID_COLS = ['Src IP', 'Dst IP', 'Timestamp']

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

X_train = df_train.drop(columns=[LABEL_COL, "Src IP", "Dst IP", "Timestamp"])
y_train = df_train[LABEL_COL]
X_test  = df_test.drop(columns=[LABEL_COL, "Src IP", "Dst IP", "Timestamp"])
y_test  = df_test[LABEL_COL]

print(df_train.shape)
print(df_train.columns)
print(df_train.loc[1])

(25901651, 17)
Index(['Timestamp', 'Src IP', 'Dst IP', 'Bwd Packet Length Min', 'Protocol_6',
       'Bwd Packets/s', 'FWD Init Win Bytes', 'Packet Length Std',
       'FIN Flag Count', 'SrcPortRange_registered', 'Packet Length Min',
       'Fwd Seg Size Min', 'DstPortRange_well_known', 'Bwd IAT Total',
       'SYN Flag Count', 'Bwd Packet Length Std', 'target'],
      dtype='object')
Timestamp                  2018-02-16 12:38:45.787171
Src IP                                   172.31.66.26
Dst IP                                  23.219.88.169
Bwd Packet Length Min                               0
Protocol_6                                       True
Bwd Packets/s                                0.133515
FWD Init Win Bytes                               8192
Packet Length Std                          113.214348
FIN Flag Count                                      0
SrcPortRange_registered                         False
Packet Length Min                                   0
Fwd Seg Size Min  

#### Recreate the full dataset

In [2]:
df_all = pd.concat([df_train, df_test], ignore_index=True)
df_all.shape

(32377064, 17)

#### Extract all unique communications

In [3]:
import networkx as nx

# Get unique pairs of communicating IPs
edges = df_all[['Src IP', 'Dst IP']].drop_duplicates()

# Create the binary graph
G = nx.Graph()
G.add_edges_from(edges.itertuples(index=False, name=None))

print(f"Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

Graph built with 107583 nodes and 1358779 edges


#### Generate node embeddings

In [4]:
# from node2vec import Node2Vec

# node2vec = Node2Vec(
#     G,
#     dimensions=128,   # embedding dimension
#     walk_length=20,
#     num_walks=100,
#     workers=4
# )
# model = node2vec.fit(window=10, min_count=1, batch_words=4)

# # Save or access embeddings
# embeddings = {node: model.wv[node] for node in G.nodes()}

better perfmormance

In [5]:
# from karateclub.node_embedding.neighbourhood import Node2Vec

# model = Node2Vec(dimensions=128, walk_length=10, num_walks=10, workers=8)
# model.fit(G)
# embeddings = model.get_embedding()

##### Generate node embeddings using torch

In [6]:
# 1. Convert node labels to integers

import networkx as nx
import torch

# Create mapping from node -> integer ID
mapping = {node: i for i, node in enumerate(G.nodes())}

# Relabel graph
G_int = nx.relabel_nodes(G, mapping)

# Convert to edge_index tensor
edge_index = torch.tensor(list(G_int.edges)).t().contiguous().long()

print(edge_index.shape)
print(edge_index[:, :5])  # preview first edges

torch.Size([2, 1358779])
tensor([[0, 0, 0, 0, 0],
        [1, 2, 3, 4, 5]])


In [None]:
# 2. Continue with Node2Vec
from torch_geometric.nn import Node2Vec

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = Node2Vec(
    edge_index,
    embedding_dim=64,          # smaller = faster
    walk_length=10,
    context_size=5,
    walks_per_node=5,
    num_negative_samples=1,
    sparse=True
).to(device)

generate embeddings

In [None]:

# Quick training
# loader = model.loader(batch_size=256, shuffle=True, num_workers=8)
# fix for dataloader bug
loader = model.loader(batch_size=128, shuffle=True, num_workers=0)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

def train():
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(3):  # only a few epochs for speed
    loss = train()
    print(f"Epoch {epoch}, loss: {loss:.4f}")

Epoch 0, loss: 3.1729
Epoch 1, loss: 1.9806
Epoch 2, loss: 1.3766


In [13]:
# Extract embeddings
emb = model.embedding.weight.cpu().detach().numpy()

### Map embeddings back to IPs

In [44]:
# Reverse mapping: integer ID -> IP
rev_mapping = {v: k for k, v in mapping.items()}

rand_flow = list(G.edges())[0]
print(rand_flow)
rand_ip = rand_flow[1]

ip_emb = emb[mapping[rand_ip]]
print(ip_emb)

('172.31.66.26', '72.21.91.29')
[-0.00854632  0.49965182 -0.11755073  0.23856659 -0.192481   -0.79347897
  0.39562842 -0.57249093 -0.49602044  0.42577937  0.26082548 -0.00324773
 -0.23080257  0.55366075 -0.14265102 -0.42933515  0.5296962  -0.1765836
  0.49596563  0.06597757  0.75427926  0.48870236  0.85588676 -0.7022354
  0.7738369   1.2348714   0.77049667 -0.06970648 -0.08437681  0.5419675
 -1.4111848   0.01033599 -0.5393135  -1.9422607   0.06130738  0.43619543
  0.11422279 -0.5507264   0.30981502 -0.20507352 -0.36775368 -0.8699458
 -0.1363299   0.99207765 -0.4000832  -0.63919824 -0.3429443  -0.19293809
 -0.30460325 -0.21815975 -0.66996694 -0.49979997  0.43131134 -0.20151842
 -0.19916423  0.32269636  0.15019001 -0.21957003  0.12967624 -0.0734797
  0.11119702  0.25410324  0.08673614  0.41298044]


In [32]:
node_ids = list(mapping.keys())  # original IPs
embedding_df = pd.DataFrame(emb, index=node_ids)

def add_node2vec_features(df, embedding_df):
    # Merge source and destination embeddings
    src_emb = embedding_df.loc[df['Src IP']].reset_index(drop=True)
    dst_emb = embedding_df.loc[df['Dst IP']].reset_index(drop=True)
    
    # Rename columns
    src_emb.columns = [f'src_emb_{i}' for i in range(src_emb.shape[1])]
    dst_emb.columns = [f'dst_emb_{i}' for i in range(dst_emb.shape[1])]
    
    # Concatenate with original features
    df_with_emb = pd.concat([df.reset_index(drop=True), src_emb, dst_emb], axis=1)
    return df_with_emb

train_with_emb = add_node2vec_features(df_train, embedding_df)
test_with_emb  = add_node2vec_features(df_test, embedding_df)

# Now your X_train includes edge features + src/dst embeddings
X_train_full = train_with_emb.drop(columns=['target', 'Src IP', 'Dst IP', 'Timestamp'])
y_train = train_with_emb['target']

In [45]:
print(train_with_emb.loc[0])
# print(X_train_full.loc[123_456])

Timestamp                2018-02-16 12:38:47.587883
Src IP                                 172.31.66.26
Dst IP                                  72.21.91.29
Bwd Packet Length Min                             0
Protocol_6                                     True
                                    ...            
dst_emb_59                                 -0.07348
dst_emb_60                                 0.111197
dst_emb_61                                 0.254103
dst_emb_62                                 0.086736
dst_emb_63                                  0.41298
Name: 0, Length: 145, dtype: object
