In [3]:

!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [4]:
import pandas as pd

# Load your node embedding data
nodes_df = pd.read_csv("/content/node embedding.csv")


nodes_df['label_binary'] = nodes_df['label'].apply(lambda x: 1 if x in [1, 2, 3, 4] else 0)


nodes_df.to_csv("/content/node_embedding_binary_label.csv", index=False)

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GCNConv
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim // 2)
        self.conv3 = GCNConv(hidden_dim // 2, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.batch_norm1 = nn.BatchNorm1d(hidden_dim)
        self.batch_norm2 = nn.BatchNorm1d(hidden_dim // 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = self.batch_norm2(x)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv3(x, edge_index)

        return F.log_softmax(x, dim=1)

def load_and_prepare_data(nodes_path, edges_path):
    """
    Load and prepare data for GCN node classification with proper index mapping
    """
    # Load nodes data
    nodes_df = pd.read_csv(nodes_path)

    # Create node ID mapping
    unique_nodes = nodes_df['node_id'].unique()
    node_id_to_idx = {node_id: idx for idx, node_id in enumerate(unique_nodes)}

    # Separate features and labels
    feature_columns =  nodes_df.columns.tolist()

    # Create node features tensor
    node_features = nodes_df[feature_columns].values
    x = torch.FloatTensor(node_features)

    # Get labels (assuming you have a 'label' column)
    labels = nodes_df['label_binary'].values
    y = torch.LongTensor(labels)

    # Load edges data
    edges_df = pd.read_csv(edges_path)

    # Map node IDs to indices
    edge_index_source = [node_id_to_idx[node_id] for node_id in edges_df['node1']]
    edge_index_target = [node_id_to_idx[node_id] for node_id in edges_df['node2']]

    # Create edge index tensor
    edge_index = torch.tensor([edge_index_source, edge_index_target], dtype=torch.long)

    # Create PyG Data object
    data = Data(x=x,
                edge_index=edge_index,
                y=y,
                node_id_to_idx=node_id_to_idx)

    print(f"Number of nodes: {x.shape[0]}")
    print(f"Number of edges: {edge_index.shape[1]}")
    print(f"Number of node features: {x.shape[1]}")
    print(f"Number of classes: {len(torch.unique(y))}")
    print(f"Edge index range: [{edge_index.min()}, {edge_index.max()}]")

    return data

def create_train_val_test_masks(data, train_size=0.5, val_size=0.45):
    """Create masks for train/validation/test split"""
    num_nodes = data.x.size(0)
    indices = np.arange(num_nodes)

    # Split indices into train, validation, and test
    train_idx, temp_idx = train_test_split(indices, train_size=train_size, random_state=42)
    val_idx, test_idx = train_test_split(temp_idx, train_size=val_size/(1-train_size), random_state=42)

    # Create boolean masks
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[train_idx] = True
    val_mask[val_idx] = True
    test_mask[test_idx] = True

    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data

def train_model(model, data, epochs=2, lr=0.00001):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    criterion = nn.NLLLoss()

    for epoch in range(epochs):
        # Training
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            out = model(data)
            val_loss = criterion(out[data.val_mask], data.y[data.val_mask])

            # Get predictions
            pred = out.argmax(dim=1)
            train_acc = accuracy_score(data.y[data.train_mask].numpy(),
                                     pred[data.train_mask].numpy())
            val_acc = accuracy_score(data.y[data.val_mask].numpy(),
                                   pred[data.val_mask].numpy())

        if epoch % 20 == 0:
            print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    return model

def evaluate_model(model, data):
    """Evaluate the model on test set"""
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)

        # Get test accuracy
        test_acc = accuracy_score(data.y[data.test_mask].numpy(),
                                pred[data.test_mask].numpy())

        # Get detailed classification report
        report = classification_report(data.y[data.test_mask].numpy(),
                                    pred[data.test_mask].numpy())

    return test_acc, report, pred

def main(nodes_path, edges_path):
    # Load and prepare data
    data = load_and_prepare_data(nodes_path, edges_path)

    # Create train/val/test splits
    data = create_train_val_test_masks(data)

    # Initialize model
    input_dim = data.x.size(1)  # Number of node features
    hidden_dim = 256*16            # Adjustable hidden dimension
    num_classes = len(torch.unique(data.y))  # Number of unique classes

    model = GCN(input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_classes=num_classes)

    # Train model
    model = train_model(model, data)

    # Evaluate model
    test_acc, classification_rep, predictions = evaluate_model(model, data)

    print("\nTest Accuracy:", test_acc)
    # print("\nClassification Report:")
    # print(classification_rep)

    # Save predictions
    all_predictions = pd.DataFrame({
        'node_id': list(data.node_id_to_idx.keys()),
        'predicted_class': predictions.numpy()
    })
    all_predictions.to_csv('predictions.csv', index=False)

    return model, data

if __name__ == "__main__":
    nodes_path = "/content/node_embedding_binary_label.csv"
    edges_path = "/content/edge embedding.csv"

    model, data = main(nodes_path, edges_path)

Number of nodes: 150
Number of edges: 281
Number of node features: 58
Number of classes: 2
Edge index range: [0, 149]
Epoch 000, Loss: 1.2054, Train Acc: 0.3333, Val Loss: 4.0742, Val Acc: 0.3134

Test Accuracy: 0.375


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
