In [3]:
pip install networkx

Collecting networkx
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: networkx
Successfully installed networkx-3.4.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl (25.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.1/25.1 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.6.0-py3-none-any.whl (18 k

In [17]:
import os
import numpy as np
import networkx as nx
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
import json

def load_graphs_from_edgelist(data_dir, label_dir):
    graphs, labels = [], []
    
    for root, _, files in os.walk(data_dir):
        for file in files:
            if not file.endswith(".edgelist") or file == ".DS_Store":
                continue
            
            graph_path = os.path.join(root, file)
            rel_path = os.path.relpath(graph_path, data_dir)
            path_parts = rel_path.split(os.sep)
            
            if len(path_parts) < 2:
                print(f"Skipping invalid path: {graph_path}")
                continue
            
            category = path_parts[0]
            sha256 = os.path.splitext(file)[0]
            label = 0 if category.lower() == 'benign' else 1
            
            try:
                graph = nx.read_edgelist(graph_path, nodetype=int, encoding="utf-8")
                graphs.append(graph)
                labels.append(label)
            except Exception as e:
                print(f"Error loading {graph_path}: {e}")
    
    return graphs, np.array(labels)

# Convert Graphs to Adjacency Matrices
def graph_to_adj_matrix(graph, max_nodes=100):
    adj_matrix = np.zeros((max_nodes, max_nodes))
    nodes = list(graph.nodes())[:max_nodes]
    node_map = {node: i for i, node in enumerate(nodes)}
    
    for edge in graph.edges():
        if edge[0] in node_map and edge[1] in node_map:
            i, j = node_map[edge[0]], node_map[edge[1]]
            adj_matrix[i, j] = 1
            adj_matrix[j, i] = 1  # Ensure symmetry
    
    return adj_matrix

In [19]:
import os
print(os.listdir("/Users/danigeorge/Documents/HPE Project/malnet-graphs-tiny"))


['addisplay', '.DS_Store', 'trojan', 'downloader', 'benign', 'all-less-than-5k-nodes.txt', 'adware']


In [21]:
# Step 2: Create Training and Testing Data
def create_datasets(graphs, labels, test_size=0.2, max_nodes=100):
    data = np.array([graph_to_adj_matrix(g, max_nodes) for g in graphs])
    data = np.expand_dims(data, axis=-1)  # Add channel dimension
    labels = np.array(labels)
    
    return train_test_split(data, labels, test_size=test_size, random_state=42)

# Custom callback to compute additional metrics per epoch
class MetricsCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs is None:
            logs = {}
        val_predictions = (self.model.predict(self.validation_data[0]) > 0.5).astype(int)
        val_labels = self.validation_data[1]
        logs['val_f1'] = f1_score(val_labels, val_predictions)
        logs['val_auc'] = roc_auc_score(val_labels, val_predictions)
        logs['val_precision'] = precision_score(val_labels, val_predictions)
        logs['val_recall'] = recall_score(val_labels, val_predictions)
        print(f"Epoch {epoch+1}: Val F1: {logs['val_f1']:.4f}, Val AUC: {logs['val_auc']:.4f}, Val Precision: {logs['val_precision']:.4f}, Val Recall: {logs['val_recall']:.4f}")


In [23]:
# Step 3: Define a CNN Model with Dropout and Early Stopping
def build_cnn_model(input_shape):
    model = keras.Sequential([
        layers.Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.Dropout(0.2),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.Dropout(0.2),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam', 
        loss='binary_crossentropy', 
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall')]
    )
    return model

In [25]:
# Step 4: Train and Evaluate the Model
def train_model(train_data, train_labels, test_data, test_labels, epochs=35, batch_size=32):
    input_shape = train_data.shape[1:]
    model = build_cnn_model(input_shape)
    
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(
        train_data, train_labels, 
        epochs=epochs, batch_size=batch_size, 
        validation_data=(test_data, test_labels),
        callbacks=[early_stopping]
    )
    
    predictions = model.predict(test_data).flatten()
    pred_labels = (predictions > 0.5).astype(int)
    
    val_accuracy = np.mean(pred_labels == test_labels)
    val_f1 = f1_score(test_labels, pred_labels)
    val_auc = roc_auc_score(test_labels, predictions)
    val_precision = precision_score(test_labels, pred_labels)
    val_recall = recall_score(test_labels, pred_labels)
    
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation F1 Score: {val_f1:.4f}")
    print(f"Validation AUC: {val_auc:.4f}")
    print(f"Validation Precision: {val_precision:.4f}")
    print(f"Validation Recall: {val_recall:.4f}")
    
    return model, history

# Load Data
data_dir = "/Users/danigeorge/Documents/HPE Project/malnet-graphs-tiny"
label_dir = "/Users/danigeorge/Documents/HPE Project/malnet-labels"
graphs, labels = load_graphs_from_edgelist(data_dir, label_dir)
train_data, test_data, train_labels, test_labels = create_datasets(graphs, labels)

# Train Model
model, history = train_model(train_data, train_labels, test_data, test_labels)

Epoch 1/35
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 1s/step - accuracy: 0.7763 - auc: 0.7244 - loss: 0.4558 - precision: 0.8066 - recall: 0.9428 - val_accuracy: 0.8260 - val_auc: 0.8518 - val_loss: 0.3759 - val_precision: 0.8348 - val_recall: 0.9722
Epoch 2/35
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 1s/step - accuracy: 0.8552 - auc: 0.9010 - loss: 0.2975 - precision: 0.8723 - recall: 0.9635 - val_accuracy: 0.8330 - val_auc: 0.8631 - val_loss: 0.3545 - val_precision: 0.8480 - val_recall: 0.9608
Epoch 3/35
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 952ms/step - accuracy: 0.9122 - auc: 0.9634 - loss: 0.2073 - precision: 0.9203 - recall: 0.9744 - val_accuracy: 0.8420 - val_auc: 0.8646 - val_loss: 0.4058 - val_precision: 0.8527 - val_recall: 0.9671
Epoch 4/35
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 1s/step - accuracy: 0.9479 - auc: 0.9867 - loss: 0.1234 - precision: 0.9571 - recall: 0