In [None]:
# Graph creation for training

import pandas as pd
import networkx as nx
import pickle
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the dataset
df = pd.read_csv('dataset')

# Convert 'Timestamp' to datetime and set as index
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df.set_index('Timestamp', inplace=True)

# Sort the DataFrame by the index (Timestamp) so time-based grouping sees rows in order
df.sort_index(inplace=True)

# Define time interval (daily, hourly, per minute, ...).
# NOTE(review): 'X' looks like a placeholder - presumably it must be replaced by a
# pandas frequency string before running; confirm.
time_interval = 'X'

# Columns that must exist before any graph is built. The list covers every column
# that process_time_slice reads (including the idle / ratio / URG-flag columns used
# for the "context" and "knowledge" attributes), so a missing column is reported
# once here instead of surfacing as a KeyError in every time slice.
required_columns = [
    'Src IP', 'Dst IP', 'Flow ID', 'Src Port', 'Dst Port', 'Protocol',
    'Flow Duration', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Flow IAT Max',
    'Flow IAT Mean', 'Flow IAT Min', 'Flow IAT Std', 'Fwd IAT Max',
    'Fwd IAT Mean', 'Fwd IAT Min', 'Fwd IAT Std', 'Bwd IAT Max',
    'Bwd IAT Mean', 'Bwd IAT Min', 'Bwd IAT Std', 'Active Max',
    'Active Mean', 'Active Min', 'Active Std', 'Idle Max', 'Idle Mean',
    'Idle Min', 'Idle Std', 'Down/Up Ratio', 'Fwd URG Flags', 'Bwd URG Flags',
    'Label'
]

# Check for missing columns in the DataFrame
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"The following required columns are missing from the DataFrame: {missing_columns}")
else:
    print("All required columns are present in the DataFrame.")

def process_time_slice(slice_df, interval, index):
    """Build a directed IP graph from one time slice and pickle it.

    Args:
        slice_df: DataFrame holding the flows of one time window; its index
            value is stored as the ``timestamp`` edge attribute.
        interval: Interval label, used only in the output filename.
        index: Position of the slice, used in the filename and in messages.

    Returns:
        The pickle filename when a graph was written, otherwise a message
        string (missing column, or nothing to save).
    """
    # Edge attribute name -> DataFrame column, grouped by the `interaction`
    # value they are written with.
    network_columns = {
        'flow_id': 'Flow ID', 'src_port': 'Src Port', 'dst_port': 'Dst Port',
        'protocol': 'Protocol',
        'fwd_psh_flags': 'Fwd PSH Flags', 'bwd_psh_flags': 'Bwd PSH Flags',
        'flow_iat_max': 'Flow IAT Max', 'flow_iat_mean': 'Flow IAT Mean',
        'flow_iat_min': 'Flow IAT Min', 'flow_iat_std': 'Flow IAT Std',
        'fwd_iat_max': 'Fwd IAT Max', 'fwd_iat_mean': 'Fwd IAT Mean',
        'fwd_iat_min': 'Fwd IAT Min', 'fwd_iat_std': 'Fwd IAT Std',
        'bwd_iat_max': 'Bwd IAT Max', 'bwd_iat_mean': 'Bwd IAT Mean',
        'bwd_iat_min': 'Bwd IAT Min', 'bwd_iat_std': 'Bwd IAT Std',
        'active_max': 'Active Max', 'active_mean': 'Active Mean',
        'active_min': 'Active Min', 'active_std': 'Active Std',
    }
    context_columns = {
        'idle_max': 'Idle Max', 'idle_mean': 'Idle Mean',
        'idle_min': 'Idle Min', 'idle_std': 'Idle Std',
    }
    knowledge_columns = {
        'down_up_ratio': 'Down/Up Ratio',
        'fwd_urg_flags': 'Fwd URG Flags', 'bwd_urg_flags': 'Bwd URG Flags',
    }

    graph = nx.DiGraph()
    try:
        for stamp, row in slice_df.iterrows():
            # Rows without both endpoints cannot form an edge.
            source = row['Src IP']
            if pd.isna(source):
                continue
            target = row['Dst IP']
            if pd.isna(target):
                continue

            graph.add_node(source, type='IP')
            graph.add_node(target, type='IP')

            # NOTE(review): a DiGraph keeps one edge per ordered (source, target)
            # pair, so the three add_edge calls below update the same edge: the
            # attributes accumulate and `interaction` ends up as 'knowledge'.
            # Later rows for the same pair overwrite earlier ones.

            # Network communication attributes
            network_attrs = {name: row[col] for name, col in network_columns.items()}
            graph.add_edge(source, target, interaction='network_communication',
                           **network_attrs)

            # Context of device and environment attributes
            context_attrs = {name: row[col] for name, col in context_columns.items()}
            graph.add_edge(source, target, interaction='context',
                           timestamp=stamp, **context_attrs)

            # Knowledge graph attributes
            knowledge_attrs = {name: row[col] for name, col in knowledge_columns.items()}
            graph.add_edge(source, target, interaction='knowledge',
                           **knowledge_attrs)

    except KeyError as err:
        print(f"KeyError in slice {index}: {err} - missing data fields.")
        return f"KeyError in slice {index}: {err}"

    # Nothing to persist for a completely empty graph.
    if graph.number_of_edges() == 0 and graph.number_of_nodes() == 0:
        return f"No data to save for slice {index}"

    filename = f"graph_{interval}_{index}.pkl"
    with open(filename, "wb") as out:
        pickle.dump(graph, out)
    return filename

# Continue with processing if no columns are missing
if not missing_columns:
    # One DataFrame per time window of length `time_interval`.
    time_slices = [g for _, g in df.groupby(pd.Grouper(freq=time_interval))]

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(process_time_slice, slice_df, time_interval, i)
            for i, slice_df in enumerate(time_slices)
        ]

        for future in as_completed(futures):
            result = future.result()
            # process_time_slice returns the pickle filename ("graph_...") on
            # success and a diagnostic message otherwise; only the former is
            # reported as saved.
            if isinstance(result, str) and result.startswith("graph"):
                print(f"Saved: {result}")
            else:
                print(result)


In [None]:
#graph creation for Test

import pandas as pd
import networkx as nx
import pickle
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Read the test split, parse its timestamps and index it chronologically
df = pd.read_csv('Ton_test_up.csv')
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.set_index('Timestamp').sort_index()

# Width of one time slice (daily, hourly, per minute, ...)
time_interval = 'X'

# Every column read while building edges, plus the 'Attack' label column
required_columns = [
    # flow identity
    'Src IP', 'Dst IP', 'Flow ID', 'Src Port', 'Dst Port', 'Protocol',
    # flow duration and PSH flags
    'Flow Duration', 'Fwd PSH Flags', 'Bwd PSH Flags',
    # inter-arrival time statistics
    'Flow IAT Max', 'Flow IAT Mean', 'Flow IAT Min', 'Flow IAT Std',
    'Fwd IAT Max', 'Fwd IAT Mean', 'Fwd IAT Min', 'Fwd IAT Std',
    'Bwd IAT Max', 'Bwd IAT Mean', 'Bwd IAT Min', 'Bwd IAT Std',
    # active / idle statistics
    'Active Max', 'Active Mean', 'Active Min', 'Active Std',
    'Idle Max', 'Idle Mean', 'Idle Min', 'Idle Std',
    # ratio, URG flags and label
    'Down/Up Ratio', 'Fwd URG Flags', 'Bwd URG Flags', 'Attack',
]

# Report which required columns (if any) the CSV lacks
available_columns = set(df.columns)
missing_columns = [name for name in required_columns if name not in available_columns]
if not missing_columns:
    print("All required columns are present in the DataFrame.")
else:
    print(f"The following required columns are missing from the DataFrame: {missing_columns}")

def process_time_slice(slice_df, interval, index):
    """Build a directed IP graph plus attack labels for one time slice and pickle both.

    Args:
        slice_df: DataFrame holding the flows of one time window; each index
            value must provide ``.timestamp()``.
        interval: Interval label, used only in the output filename.
        index: Position of the slice, used in the filename and in messages.

    Returns:
        The pickle filename when ``(graph, labels)`` was written, otherwise a
        message string (missing column, or nothing to save).
    """
    # Edge attribute name -> DataFrame column.
    shared_columns = {
        'flow_id': 'Flow ID', 'src_port': 'Src Port', 'dst_port': 'Dst Port',
        'protocol': 'Protocol', 'fwd_psh_flags': 'Fwd PSH Flags',
        'bwd_psh_flags': 'Bwd PSH Flags', 'flow_iat_max': 'Flow IAT Max',
        'flow_iat_mean': 'Flow IAT Mean', 'flow_iat_min': 'Flow IAT Min',
        'flow_iat_std': 'Flow IAT Std', 'fwd_iat_max': 'Fwd IAT Max',
        'fwd_iat_mean': 'Fwd IAT Mean', 'fwd_iat_min': 'Fwd IAT Min',
        'fwd_iat_std': 'Fwd IAT Std', 'bwd_iat_max': 'Bwd IAT Max',
        'bwd_iat_mean': 'Bwd IAT Mean', 'bwd_iat_min': 'Bwd IAT Min',
        'bwd_iat_std': 'Bwd IAT Std', 'active_max': 'Active Max',
        'active_mean': 'Active Mean', 'active_min': 'Active Min',
        'active_std': 'Active Std',
    }
    context_columns = {
        'idle_max': 'Idle Max', 'idle_mean': 'Idle Mean',
        'idle_min': 'Idle Min', 'idle_std': 'Idle Std',
    }
    knowledge_columns = {
        'down_up_ratio': 'Down/Up Ratio', 'fwd_urg_flags': 'Fwd URG Flags',
        'bwd_urg_flags': 'Bwd URG Flags',
    }

    graph = nx.DiGraph()
    labels = {}  # (src, dst, interaction) -> value of the 'Attack' column

    try:
        for stamp, row in slice_df.iterrows():
            # Rows without both endpoints cannot form an edge.
            if pd.isna(row['Src IP']):
                continue
            if pd.isna(row['Dst IP']):
                continue
            source, target = row['Src IP'], row['Dst IP']

            # Attributes written with every interaction type
            shared = {name: row[col] for name, col in shared_columns.items()}

            # NOTE(review): a DiGraph keeps one edge per ordered (source, target)
            # pair, so the three add_edge calls below update the same edge and
            # `interaction` ends up as 'knowledge'.

            # Network communication edge
            graph.add_edge(source, target, **shared, interaction='network_communication')

            # Context edge
            graph.add_edge(source, target, **shared, interaction='context',
                           timestamp=stamp.timestamp(),
                           **{name: row[col] for name, col in context_columns.items()})

            # Knowledge graph edge
            graph.add_edge(source, target, **shared, interaction='knowledge',
                           **{name: row[col] for name, col in knowledge_columns.items()})

            # The same attack status is recorded under each interaction type
            for interaction in ('network_communication', 'context', 'knowledge'):
                labels[(source, target, interaction)] = row['Attack']

    except KeyError as err:
        print(f"KeyError in slice {index}: {err} - row data: {row.to_dict()}")
        return f"KeyError in slice {index}: {err}"

    # Nothing to persist for a completely empty graph.
    if graph.number_of_edges() == 0 and graph.number_of_nodes() == 0:
        return f"No data to save for slice {index}"

    graph_filename = f"graph_{interval}_{index}.pkl"
    with open(graph_filename, "wb") as out:
        pickle.dump((graph, labels), out)  # graph and labels travel together
    return graph_filename

# Build and save one graph per time slice, but only when validation passed
if not missing_columns:
    time_slices = [group for _, group in df.groupby(pd.Grouper(freq=time_interval))]

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(process_time_slice, slice_df, time_interval, i)
            for i, slice_df in enumerate(time_slices)
        ]

        # Report each slice as soon as its worker finishes
        for done in as_completed(futures):
            outcome = done.result()
            saved = isinstance(outcome, str) and outcome.startswith("graph")
            print(f"Saved Graph and Labels: {outcome}" if saved else outcome)

In [None]:
#community detection for train
import os
import pickle
import torch
from torch_geometric.utils import from_networkx
import networkx as nx
from tqdm.contrib.concurrent import process_map  # for progress bars with multiprocessing
from concurrent.futures import ProcessPoolExecutor
import random
import time  # Import the time module

def load_graph(graph_path):
    """Unpickle and return the object stored at ``graph_path``.

    Prints the path before reading. The file is read with ``pickle``, so it
    should only be used on files produced by a trusted source.
    """
    print(f"Loading graph from {graph_path}")
    with open(graph_path, 'rb') as handle:
        return pickle.load(handle)

def directed_label_propagation(G, max_iter=100):
    """Assign community labels by propagating labels along incoming edges.

    Every node starts with its own unique integer label. In each sweep a node
    adopts the most frequent label among its predecessors; nodes without
    predecessors keep their label. Sweeps stop once nothing changes or after
    ``max_iter`` sweeps.

    Args:
        G: Graph object providing ``nodes()`` and ``predecessors(node)``.
        max_iter: Maximum number of sweeps over all nodes.

    Returns:
        Dict mapping each node to its final label.
    """
    from collections import Counter  # local import keeps the block self-contained

    # Unique starting label per node (its enumeration index).
    labels = {n: i for i, n in enumerate(G.nodes())}
    nodes = list(G.nodes())
    random.shuffle(nodes)  # one random visiting order, reused for every sweep

    for _ in range(max_iter):
        converged = True  # flips to False as soon as any label changes
        for node in nodes:
            # Labels of the inbound neighbours
            in_labels = [labels[n] for n in G.predecessors(node)]

            if not in_labels:
                continue

            # Count once instead of calling list.count per distinct label
            # (which is quadratic for high in-degree nodes). Ties are still
            # resolved by iteration order over set(in_labels), as before.
            counts = Counter(in_labels)
            most_common = max(set(in_labels), key=counts.__getitem__)

            if labels[node] != most_common:
                labels[node] = most_common
                converged = False

        if converged:
            break  # no label changed during this sweep

    return labels

def detect_label_propagation(nx_graph):
    """Run directed label propagation on ``nx_graph`` and summarise the result.

    Returns:
        Dict with ``num_large_communities`` (communities with at least three
        nodes), ``duration`` (seconds spent in the propagation call) and
        ``community_labels`` (node -> community id).
    """
    print("Starting label propagation...")
    started = time.time()
    node_to_community = directed_label_propagation(nx_graph)
    finished = time.time()

    # Invert node -> community into community -> member list
    members = {}
    for node, community_id in node_to_community.items():
        members.setdefault(community_id, []).append(node)

    large_count = sum(1 for group in members.values() if len(group) >= 3)
    return {
        "num_large_communities": large_count,
        "duration": finished - started,
        "community_labels": node_to_community,
    }

def add_node_features(G):
    """Store a two-element feature list ``[degree, clustering]`` on every node as ``'x'``.

    Degree is read from ``G`` itself; the clustering coefficient is computed
    on ``nx.Graph(G)``, a simple undirected copy of ``G``.
    """
    degree_of = dict(G.degree())
    clustering_of = nx.clustering(nx.Graph(G))

    for node in G.nodes():
        features = [degree_of.get(node, 0), clustering_of.get(node, 0)]
        G.nodes[node]['x'] = features

def ensure_edge_attributes(G, default_attributes):
    """Fill in missing edge attributes in place.

    For every edge of ``G``, each key of ``default_attributes`` that the edge
    does not yet carry is set to the corresponding default; existing values
    are left untouched.
    """
    for _, _, attrs in G.edges(data=True):
        for name, fallback in default_attributes.items():
            attrs.setdefault(name, fallback)

def add_community_labels_to_nodes(G, community_labels):
    """Write each node's community id into its ``'community'`` attribute.

    Nodes that do not appear in ``community_labels`` receive ``-1``.
    """
    for node in G.nodes():
        community = community_labels.get(node, -1)
        G.nodes[node]['community'] = community

def process_graph(graph_path):
    """Turn one pickled graph into a PyTorch Geometric object and save it.

    Loads the graph, attaches community labels and node features, fills in
    missing edge attributes, converts the graph with ``from_networkx`` and
    writes the result as ``<name>.pt`` into ``save_dir``.

    Reads the module-level names ``default_attributes`` and ``save_dir``.

    Returns:
        Path of the saved ``.pt`` file.
    """
    graph = load_graph(graph_path)
    detection = detect_label_propagation(graph)
    add_community_labels_to_nodes(graph, detection['community_labels'])
    add_node_features(graph)
    ensure_edge_attributes(graph, default_attributes)

    # Convert to PyTorch Geometric data, then set x / y explicitly from node attributes
    pyg_data = from_networkx(graph)
    node_features = [graph.nodes[node]['x'] for node in graph.nodes()]
    pyg_data.x = torch.tensor(node_features, dtype=torch.float)
    node_communities = [graph.nodes[node]['community'] for node in graph.nodes()]
    pyg_data.y = torch.tensor(node_communities, dtype=torch.float)

    # Output name: input basename with its last four characters replaced by ".pt"
    stem = os.path.basename(graph_path)[:-4]
    save_path = os.path.join(save_dir, f"{stem}.pt")
    torch.save(pyg_data, save_path)
    return save_path

def main():
    """Run ``process_graph`` over every ``.pkl`` file in ``graph_dir`` with four workers."""
    pickle_paths = []
    for entry in os.listdir(graph_dir):
        if entry.endswith('.pkl'):
            pickle_paths.append(os.path.join(graph_dir, entry))
    process_map(process_graph, pickle_paths, max_workers=4, chunksize=1)

if __name__ == "__main__":
    # Input directory with pickled graphs and output directory for .pt files
    graph_dir = 'address of directory'
    save_dir = 'address of directory'
    os.makedirs(save_dir, exist_ok=True)

    # Default for every edge attribute: None for all flow features, and a
    # weight of 1.0.
    default_attributes = {
        **dict.fromkeys((
            'flow_id', 'src_port', 'dst_port', 'protocol',
            'flow_duration', 'fwd_psh_flags', 'bwd_psh_flags',
            'flow_iat_max', 'flow_iat_mean', 'flow_iat_min', 'flow_iat_std',
            'fwd_iat_max', 'fwd_iat_mean', 'fwd_iat_min', 'fwd_iat_std',
            'bwd_iat_max', 'bwd_iat_mean', 'bwd_iat_min', 'bwd_iat_std',
            'active_max', 'active_mean', 'active_min', 'active_std',
            'timestamp', 'idle_max', 'idle_mean', 'idle_min', 'idle_std',
            'down_up_ratio', 'fwd_urg_flags', 'bwd_urg_flags',
        )),
        'weight': 1.0,
    }
    main()


In [None]:
#community detection for test
import os
import pickle
import torch
from torch_geometric.utils import from_networkx
import networkx as nx
from tqdm.contrib.concurrent import process_map  # for progress bars with multiprocessing
from concurrent.futures import ProcessPoolExecutor
import random
import time  # Import the time module

def load_graph(graph_path):
    """Unpickle and return the object stored at ``graph_path``.

    Prints the path before reading. The file is read with ``pickle``, so it
    should only be used on files produced by a trusted source.
    """
    print(f"Loading graph from {graph_path}")
    with open(graph_path, 'rb') as handle:
        return pickle.load(handle)

def directed_label_propagation(G, labels, max_iter=100):
    """Assign community labels to nodes by propagating labels along incoming edges.

    ``labels`` is treated as optional seed labels. Only entries whose key is a
    node of ``G`` are used; any other keys (for example edge tuples) are
    ignored. Every node without a seed gets a fresh integer id that no seed
    uses, so each node always has a label. The input mapping is never
    modified.

    In each sweep a node adopts the most frequent label among its
    predecessors; nodes without predecessors keep their label. Sweeps stop
    once nothing changes or after ``max_iter`` sweeps.

    Args:
        G: Graph object providing ``nodes()`` and ``predecessors(node)``.
        labels: Mapping of seed labels, or an empty mapping / ``None``.
        max_iter: Maximum number of sweeps over all nodes.

    Returns:
        A new dict mapping every node of ``G`` to its final label.
    """
    from collections import Counter  # local import keeps the block self-contained

    seeds = labels if labels else {}
    nodes = list(G.nodes())

    # Build node-level labels without touching the caller's mapping.
    taken = {seeds[n] for n in nodes if n in seeds}
    node_labels = {}
    next_id = 0
    for n in nodes:
        if n in seeds:
            node_labels[n] = seeds[n]
            continue
        while next_id in taken:  # skip ids that are already used by a seed
            next_id += 1
        node_labels[n] = next_id
        next_id += 1

    random.shuffle(nodes)  # one random visiting order, reused for every sweep

    for _ in range(max_iter):
        converged = True
        for node in nodes:
            in_labels = [node_labels[n] for n in G.predecessors(node)]
            if not in_labels:
                continue
            # Count once instead of calling list.count per distinct label;
            # ties are resolved by iteration order over set(in_labels).
            counts = Counter(in_labels)
            most_common = max(set(in_labels), key=counts.__getitem__)
            if node_labels[node] != most_common:
                node_labels[node] = most_common
                converged = False
        if converged:
            break
    return node_labels

def detect_label_propagation(nx_graph, labels):
    """Run directed label propagation on ``nx_graph`` and summarise the result.

    Args:
        nx_graph: Graph handed to ``directed_label_propagation``.
        labels: Initial labels forwarded unchanged to ``directed_label_propagation``.

    Returns:
        Dict with ``num_large_communities`` (communities with at least three
        members), ``duration`` (seconds spent in the propagation call) and
        ``community_labels`` (the mapping returned by the propagation).
    """
    print("Starting label propagation...")
    started = time.time()
    propagated = directed_label_propagation(nx_graph, labels)
    finished = time.time()

    # Invert key -> community into community -> member list
    members = {}
    for node, community_id in propagated.items():
        members.setdefault(community_id, []).append(node)

    large_count = sum(1 for group in members.values() if len(group) >= 3)
    return {
        "num_large_communities": large_count,
        "duration": finished - started,
        "community_labels": propagated,
    }

def process_graph(graph_path):
    """Turn one pickled ``(graph, labels)`` pair into a PyTorch Geometric object and save it.

    The saved object carries ``x`` (node features), ``y`` (node community id)
    and ``z`` (per-edge value looked up under the 'network_communication' key
    of ``labels``). The file is written as ``<name>.pt`` into ``save_dir``.

    Reads the module-level names ``default_attributes`` and ``save_dir``.
    NOTE(review): ``add_community_labels_to_nodes``, ``add_node_features`` and
    ``ensure_edge_attributes`` are not defined in this cell - presumably the
    training cell must have been run first; confirm.

    Returns:
        Path of the saved ``.pt`` file.
    """
    graph, labels = load_graph(graph_path)
    detection = detect_label_propagation(graph, labels)

    add_community_labels_to_nodes(graph, detection['community_labels'])
    add_node_features(graph)
    ensure_edge_attributes(graph, default_attributes)

    pyg_data = from_networkx(graph)
    node_features = [graph.nodes[node]['x'] for node in graph.nodes()]
    pyg_data.x = torch.tensor(node_features, dtype=torch.float)
    # Community label; used in training and test
    node_communities = [graph.nodes[node]['community'] for node in graph.nodes()]
    pyg_data.y = torch.tensor(node_communities, dtype=torch.float)
    # Attack yes or no; used in prediction
    edge_attacks = [labels[(u, v, 'network_communication')] for u, v in graph.edges()]
    pyg_data.z = torch.tensor(edge_attacks, dtype=torch.float)

    # Output name: input basename with its last four characters replaced by ".pt"
    stem = os.path.basename(graph_path)[:-4]
    save_path = os.path.join(save_dir, f"{stem}.pt")
    torch.save(pyg_data, save_path)
    return save_path

def main():
    """Run ``process_graph`` over every ``.pkl`` file in ``graph_dir`` with four workers."""
    pickle_paths = []
    for entry in os.listdir(graph_dir):
        if entry.endswith('.pkl'):
            pickle_paths.append(os.path.join(graph_dir, entry))
    process_map(process_graph, pickle_paths, max_workers=4, chunksize=1)

if __name__ == "__main__":
    # Input directory with pickled (graph, labels) files and output directory for .pt files
    graph_dir = 'address of directory'
    save_dir = 'address of directory'
    os.makedirs(save_dir, exist_ok=True)

    # Default for every edge attribute: None for all flow features, and a
    # weight of 1.0.
    default_attributes = {
        **dict.fromkeys((
            'flow_id', 'src_port', 'dst_port', 'protocol',
            'flow_duration', 'fwd_psh_flags', 'bwd_psh_flags',
            'flow_iat_max', 'flow_iat_mean', 'flow_iat_min', 'flow_iat_std',
            'fwd_iat_max', 'fwd_iat_mean', 'fwd_iat_min', 'fwd_iat_std',
            'bwd_iat_max', 'bwd_iat_mean', 'bwd_iat_min', 'bwd_iat_std',
            'active_max', 'active_mean', 'active_min', 'active_std',
            'timestamp', 'idle_max', 'idle_mean', 'idle_min', 'idle_std',
            'down_up_ratio', 'fwd_urg_flags', 'bwd_urg_flags',
        )),
        'weight': 1.0,
    }
    main()