# Create the train datastracture for 3 edges

In [1]:
import pandas as pd

# Specify the features to keep
features_to_keep = [
 ######list of selected features####
]

# Load your dataset
data = pd.read_csv('train_data.csv')  # Replace 'your_data.csv' with your actual file name

# Keep only the specified features
filtered_data = data[features_to_keep]

# Convert 'Timestamp' to datetime
filtered_data['Timestamp'] = pd.to_datetime(filtered_data['Timestamp'])

# Order the data by 'Timestamp'
filtered_data = filtered_data.sort_values(by='Timestamp')

# Save the temporally ordered data to a new file
filtered_data.to_csv('filtered_train_3edge.csv', index=False)

print("Filtered and temporally ordered data saved to 'filtered_train_3edge.csv'.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Timestamp'] = pd.to_datetime(filtered_data['Timestamp'])


Filtered and temporally ordered data saved to 'filtered_train_3edge.csv'.


In [2]:
#check for inside of csv (just for test, no need for run)
import pandas as pd

# Load the CSV file
file_path = "filtered_train_3edge.csv"  # Replace with your actual file path
df = pd.read_csv(file_path)

# Check if the label column contains '1'
label_column = 'Label'  # Replace with the actual label column name if different
if label_column in df.columns:
    label_distribution = df[label_column].value_counts()
    print("Label Distribution:")
    print(label_distribution)

    if 1 in label_distribution.index:
        print("The CSV contains label '1'.")
    else:
        print("The CSV does NOT contain label '1'.")
else:
    print(f"'{label_column}' column not found in the CSV.")

Label Distribution:
Label
1    1978039
0    1208711
Name: count, dtype: int64
The CSV contains label '1'.


# created hourly graph with 3 edges from train dataset

In [3]:
import pandas as pd
import networkx as nx
import os

def create_test_graphs_edge_labels(df, output_dir):
    """
    Split the DataFrame into hourly slices and create graphs for each slice.
    Each edge gets a valid label (e.g., 0 or 1) read from the DataFrame.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame with temporal data.
        output_dir (str): Directory to save the graphs.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Group the DataFrame into hourly slices using the datetime index.
    # (Assumes the DataFrame index is already a DateTimeIndex)
    time_slices = [g for _, g in df.groupby(pd.Grouper(freq='H'))]
    
    for slice_index, slice_df in enumerate(time_slices):
        if slice_df.empty:
            continue

        # Print value counts of the 'Label' column in this time-slice.
        print(f"Hour {slice_index}:")
        print(slice_df['Label'].value_counts())
        
        # Create a MultiDiGraph for this time-slice.
        G = nx.MultiDiGraph()

        for _, row in slice_df.iterrows():
            src_ip = row['Src IP']
            dst_ip = row['Dst IP']
            
            # Convert the label to an int (if missing or invalid, you can decide a fallback; here we assume it is valid)
            try:
                label = int(row['Label'])
            except Exception as e:
                print(f"Skipping row due to invalid label: {row['Label']}; error: {e}")
                continue

            if pd.isna(src_ip) or pd.isna(dst_ip):
                continue

            # Add nodes if not already present.
            if not G.has_node(src_ip):
                G.add_node(src_ip)
            if not G.has_node(dst_ip):
                G.add_node(dst_ip)

            # Add edges for different interactions.
            # 1. Network Edge
            G.add_edge(src_ip, dst_ip, key='network',
                       label=label,
                 ######list of selected features####
                       interaction='network_communication')

            # 2. Context Edge
            G.add_edge(src_ip, dst_ip, key='context',
                       label=label,
             ######list of selected features####
                       interaction='context')

            # 3. Knowledge Edge
            G.add_edge(src_ip, dst_ip, key='knowledge',
                       label=label,
             ######list of selected features####
                       interaction='knowledge')

        # Save the graph as a .gpickle file.
        graph_path = os.path.join(output_dir, f"test_graph_hour_{slice_index}.gpickle")
        nx.write_gpickle(G, graph_path)
        print(f"Test graph for hour {slice_index} saved to {graph_path}")

# Usage Example for graph creation
if __name__ == "__main__":
    # Read CSV and prepare DataFrame.
    df_test = pd.read_csv('filtered_train_3edge.csv')
    df_test['Timestamp'] = pd.to_datetime(df_test['Timestamp'])
    # Set Timestamp as index and sort (required for grouping by hour)
    df_test = df_test.set_index('Timestamp').sort_index()

    output_test_dir = "3ed_trai_h_graphs"
    create_test_graphs_edge_labels(df_test, output_test_dir)


Hour 0:
Label
0    558
Name: count, dtype: int64
Test graph for hour 0 saved to 3ed_trai_h_graphs/test_graph_hour_0.gpickle
Hour 1:
Label
0    475
Name: count, dtype: int64
Test graph for hour 1 saved to 3ed_trai_h_graphs/test_graph_hour_1.gpickle
Hour 2:
Label
0    551
Name: count, dtype: int64
Test graph for hour 2 saved to 3ed_trai_h_graphs/test_graph_hour_2.gpickle
Hour 3:
Label
0    1251
Name: count, dtype: int64
Test graph for hour 3 saved to 3ed_trai_h_graphs/test_graph_hour_3.gpickle
Hour 4:
Label
0    1513
Name: count, dtype: int64
Test graph for hour 4 saved to 3ed_trai_h_graphs/test_graph_hour_4.gpickle
Hour 5:
Label
0    1397
Name: count, dtype: int64
Test graph for hour 5 saved to 3ed_trai_h_graphs/test_graph_hour_5.gpickle
Hour 6:
Label
0    1468
Name: count, dtype: int64
Test graph for hour 6 saved to 3ed_trai_h_graphs/test_graph_hour_6.gpickle
Hour 7:
Label
0    1382
Name: count, dtype: int64
Test graph for hour 7 saved to 3ed_trai_h_graphs/test_graph_hour_7.gpickle
Hou

# Community detection for graphs and then update the graph with the label of community for each node

In [4]:
import networkx as nx
import os

def detect_and_label_communities_lpa(graph):
    """
    Perform community detection using the Label Propagation Algorithm (LPA) and label nodes with community IDs.
    Adds 'x' attribute based on the 'community' label.

    Parameters:
        graph (nx.MultiDiGraph): Input graph.

    Returns:
        graph (nx.MultiDiGraph): Updated graph with community labels and 'x' attributes.
    """
    # Convert MultiDiGraph to Graph (undirected graph for LPA)
    undirected_graph = nx.Graph(graph)

    # Perform community detection using LPA
    communities = nx.community.label_propagation_communities(undirected_graph)

    # Assign community labels to nodes and add 'x' attribute
    for community_id, community in enumerate(communities):
        for node in community:
            graph.nodes[node]['community'] = community_id
            graph.nodes[node]['x'] = [community_id]  # 'x' is a feature; wrap in a list for PyTorch Geometric compatibility

    return graph


def process_graphs_with_lpa(input_dir, output_dir):
    """
    Detect communities using LPA, update graphs with community labels, and add 'x' attribute.
    
    Parameters:
        input_dir (str): Directory containing input graphs.
        output_dir (str): Directory to save updated graphs.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Process each graph file in the input directory
    for graph_file in os.listdir(input_dir):
        if not graph_file.endswith('.gpickle'):
            continue
        
        # Load the graph
        graph_path = os.path.join(input_dir, graph_file)
        G = nx.read_gpickle(graph_path)

        # Detect communities using LPA and label nodes
        G = detect_and_label_communities_lpa(G)

        # Save the updated graph
        updated_graph_path = os.path.join(output_dir, graph_file)
        nx.write_gpickle(G, updated_graph_path)
        print(f"Updated graph with LPA communities and 'x' attribute saved to {updated_graph_path}")


# Example usage
if __name__ == "__main__":
    # Input directory containing graphs
    input_graph_dir = "3ed_trai_h_graphs"

    # Output directory for updated graphs
    output_graph_dir = "3ed_trai_h_graphs_commun"

    # Process graphs and add community labels using LPA
    process_graphs_with_lpa(input_graph_dir, output_graph_dir)



Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_14.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_17.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_13.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_18.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_1.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_12.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_11.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/test_graph_hour_6.gpickle
Updated graph with LPA communities and 'x' attribute saved to 3ed_trai_h_graphs_commun/tes

# convert Multigraph to hetrodata

In [5]:
import torch
from torch_geometric.data import HeteroData
import networkx as nx
import os

def multiDiGraph_to_hetero_with_label(G: nx.MultiDiGraph) -> HeteroData:
    """
    Converts a MultiDiGraph with multiple edge types to a HeteroData object.
    Preserves the 'label' field in data[rel_type].edge_label.
    """
    data = HeteroData()
    node_mapping = {node: i for i, node in enumerate(G.nodes())}
    data['ip'].num_nodes = G.number_of_nodes()

    # Add node-level features
    x = []
    community_labels = []
    for node in G.nodes():
        community = G.nodes[node].get('community', -1)
        community_labels.append(community)
        x.append([community])
    data['ip'].community = torch.tensor(community_labels, dtype=torch.long)
    data['ip'].x = torch.tensor(x, dtype=torch.float)

    # Process each edge from G.
    for u, v, key, edge_attrs in G.edges(data=True, keys=True):
        src = node_mapping[u]
        dst = node_mapping[v]
        rel_type = ('ip', key, 'ip')
        if rel_type not in data.edge_types:
            data[rel_type].edge_index = []
            data[rel_type].edge_attr = []
            data[rel_type].edge_label = []  # Container for the label

        data[rel_type].edge_index.append([src, dst])
        feature_vec = []
        if key == 'network':
            for attr_name in [ ######list of selected features####]:
                feature_vec.append(edge_attrs.get(attr_name, 0))
        elif key == 'context':
            for attr_name in [ ######list of selected features####]:
                feature_vec.append(edge_attrs.get(attr_name, 0))
        elif key == 'knowledge':
            for attr_name in [ ######list of selected features####]:
                feature_vec.append(edge_attrs.get(attr_name, 0))
        data[rel_type].edge_attr.append(feature_vec)
        # Save the label—this should now be valid (0 or 1)
        data[rel_type].edge_label.append(edge_attrs.get('label', -1))

    # Convert lists to tensors.
    for rel_type in data.edge_types:
        data[rel_type].edge_index = torch.tensor(data[rel_type].edge_index, dtype=torch.long).t().contiguous()
        if data[rel_type].edge_attr:
            data[rel_type].edge_attr = torch.tensor(data[rel_type].edge_attr, dtype=torch.float)
        if data[rel_type].edge_label:
            data[rel_type].edge_label = torch.tensor(data[rel_type].edge_label, dtype=torch.long)
    return data

def process_and_save_hetero_graphs_with_label(input_dir, output_dir):
    """
    Converts all .gpickle graphs in a directory to HeteroData objects and saves them as .pt,
    preserving the 'label' field in data[rel_type].edge_label.
    """
    os.makedirs(output_dir, exist_ok=True)
    for graph_file in os.listdir(input_dir):
        if not graph_file.endswith('.gpickle'):
            continue
        graph_path = os.path.join(input_dir, graph_file)
        G = nx.read_gpickle(graph_path)
        hetero_data = multiDiGraph_to_hetero_with_label(G)
        hetero_path = os.path.join(output_dir, graph_file.replace('.gpickle', '.pt'))
        torch.save(hetero_data, hetero_path)
        print(f"Saved HeteroData with labels to {hetero_path}")

if __name__ == "__main__":
    input_test_dir = "3ed_trai_h_graphs_commun"         # Input .gpickle files (with communities added)
    output_test_pt_dir = "3ed_trai_h_graphs_hetero_graphs" # Output .pt files
    process_and_save_hetero_graphs_with_label(input_test_dir, output_test_pt_dir)


Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_14.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_17.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_13.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_18.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_1.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_12.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_11.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_6.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_5.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_4.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_graphs/test_graph_hour_2.pt
Saved HeteroData with labels to 3ed_trai_h_graphs_hetero_gr