In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import random
import ipaddress  # For advanced IP address manipulation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Data Simulation

In [2]:
def _random_private_ipv4():
    """Return a random address string inside the private 10.0.0.0/8 block."""
    # np.random.randint excludes its upper bound, so each octet is in 0..254.
    return f"10.{np.random.randint(0,255)}.{np.random.randint(0,255)}.{np.random.randint(0,255)}"


def _random_global_ipv4():
    """Return a random IPv4 string that `ipaddress` classifies as global unicast."""
    # A first octet drawn from 100..254 also covers loopback (127/8),
    # link-local (169.254/16), private (172.16/12, 192.168/16), shared
    # (100.64/10), multicast (224/4) and reserved (240/4) space, so redraw
    # until the candidate really is an external, publicly routable address.
    while True:
        candidate = (
            f"{np.random.randint(100,255)}.{np.random.randint(0,255)}"
            f".{np.random.randint(0,255)}.{np.random.randint(0,255)}"
        )
        address = ipaddress.ip_address(candidate)
        if address.is_global and not address.is_multicast:
            return candidate


def generate_synthetic_network_data(num_samples=10000, anomaly_ratio=0.01, seed=42):
    """
    Generate a synthetic dataset of network connections.

    The first ``int(num_samples * anomaly_ratio)`` rows are anomalies that
    simulate possible data exfiltration: a 10.x.x.x source sending a large
    ``bytes_out`` volume to a global (publicly routable, non-multicast)
    destination. All remaining rows are normal traffic between two 10.x.x.x
    hosts with small byte counts in both directions.

    Parameters
    ----------
    num_samples : int
        Total number of rows to generate.
    anomaly_ratio : float
        Fraction of rows labelled as anomalies; must lie in [0, 1].
    seed : int or None
        Seed applied to both the ``random`` module and NumPy's global RNG so
        repeated calls give identical data. Pass ``None`` to leave the global
        RNG state untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``src_ip``, ``dst_ip``, ``bytes_out``, ``bytes_in``,
        ``hour_of_day`` and ``label`` (0 = normal, 1 = potential exfil/anomaly).

    Raises
    ------
    ValueError
        If ``anomaly_ratio`` is outside [0, 1].
    """
    if not 0.0 <= anomaly_ratio <= 1.0:
        raise ValueError(f"anomaly_ratio must be within [0, 1], got {anomaly_ratio!r}")

    if seed is not None:
        # Seeds the *global* generators, exactly as the notebook relied on.
        random.seed(seed)
        np.random.seed(seed)

    data = {
        'src_ip': [],
        'dst_ip': [],
        'bytes_out': [],
        'bytes_in': [],
        'hour_of_day': [],
        'label': []  # 0 = normal, 1 = potential exfil/anomaly
    }

    num_anomalies = int(num_samples * anomaly_ratio)

    for i in range(num_samples):
        # Draw order (hour, src, dst, bytes_out, bytes_in) is kept stable so
        # the random stream is consumed the same way on every run.
        hour = np.random.randint(0, 24)
        is_anomaly = (i < num_anomalies)
        data['label'].append(int(is_anomaly))

        src_ip = _random_private_ipv4()
        if is_anomaly:
            # Potential exfil: large bytes_out, traffic to an external global IP
            dst_ip = _random_global_ipv4()
            bytes_out = np.random.randint(500000, 2000000)  # large data transfer
            bytes_in = np.random.randint(0, 1000)
        else:
            # Normal traffic stays in private range
            dst_ip = _random_private_ipv4()
            bytes_out = np.random.randint(0, 50000)
            bytes_in = np.random.randint(0, 50000)

        data['src_ip'].append(src_ip)
        data['dst_ip'].append(dst_ip)
        data['bytes_out'].append(bytes_out)
        data['bytes_in'].append(bytes_in)
        data['hour_of_day'].append(hour)

    return pd.DataFrame(data)

# Build the working dataset: 10,000 connections, 1% of them labelled as anomalies.
df = generate_synthetic_network_data(
    num_samples=10000,
    anomaly_ratio=0.01,
)
# Show the first rows as the cell output.
df.head()


Unnamed: 0,src_ip,dst_ip,bytes_out,bytes_in,hour_of_day,label
0,10.179.92.14,206.71.188.20,1603462,121,6,1
1,10.214.74.202,187.116.99.103,778167,130,18,1
2,10.52.1.87,137.129.191.187,1784372,160,21,1
3,10.57.21.252,188.48.218.58,1933257,475,11,1
4,10.14.189.189,150.107.54.243,1055839,504,15,1


# Feature Engineering

### IP Feature Extraction Function

In [3]:
def extract_ip_features(ip_str):
    """
    Turn an IP address string into a flat list of numeric features.

    The string is parsed with `ipaddress.ip_address`, which raises
    `ValueError` when it is not a valid IPv4 or IPv6 address.

    Returns a list of eight integers, in this order:
    [
      is_private,
      is_global,
      is_loopback,
      is_link_local,
      is_reserved,
      is_multicast,
      ip_version,
      integer_representation
    ]
    The six leading entries are 0/1 flags.
    """
    address = ipaddress.ip_address(ip_str)

    # Boolean classification properties, listed in output order.
    flag_attributes = (
        "is_private",
        "is_global",
        "is_loopback",
        "is_link_local",
        "is_reserved",
        "is_multicast",
    )
    features = [int(getattr(address, attribute)) for attribute in flag_attributes]

    # Protocol version: 4 or 6.
    features.append(address.version)
    # Whole address as one integer (a 32-bit value for IPv4).
    features.append(int(address))

    return features