In [59]:
!pip install torch torch-geometric



In [60]:
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv, GATConv, GINConv
from torch_geometric.datasets import Planetoid, DeezerEurope
from torch_geometric.utils import train_test_split_edges, negative_sampling, subgraph
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_name = "CoraFull" # "DeezerEurope"
if dataset_name == "CoraFull":
    dataset = Planetoid(root='CoraFull', name='Cora')
elif dataset_name == "DeezerEurope":
    dataset = DeezerEurope(root='DeezerEurope')

data = dataset[0]
train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2
hidden_dim = 64
epochs = 100
learning_rate = 0.01

In [61]:
# Assume `data` is the Data object from your torch geometric dataset.
# train_test_split splits the graph by its nodes.
# The below example code sets test set size to 50% of the graph.
# The split from the doc was 60:20:20. You may or may not keep a validation portion,
# in which case you could have a test set of size 40%.
num_nodes = data.num_nodes
train_nodes, test_nodes = train_test_split(range(num_nodes), test_size=test_ratio, random_state=42)
train_nodes, val_nodes = train_test_split(train_nodes, test_size=val_ratio / (train_ratio + val_ratio), random_state=42)


# The below code produces the induced subgraph of train set nodes.
# In the process, it also produces a separated edge index.
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_nodes] = True
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask[val_nodes] = True
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[test_nodes] = True

train_edge_index, _ = subgraph(train_nodes, data.edge_index, relabel_nodes=True)

# If you need the features from only the nodes in the training set,
# use the train mask.
feats = data.x[train_mask]


# The code below will sample non-neighbours from the train portion of the graph.
# The non-neighbour set will be the same size as the training edge index.
# The mask ensures that only the nodes in the training set are used.
neg_edge_index = negative_sampling(edge_index=train_edge_index,
                                   num_nodes=data.num_nodes,
                                   num_neg_samples=train_edge_index.size(1))

val_edge_index, _ = subgraph(val_nodes, data.edge_index, relabel_nodes=False)
val_neg_edge_index = negative_sampling(
    edge_index=val_edge_index,
    num_nodes=data.num_nodes,
    num_neg_samples=val_edge_index.size(1),
)

test_edge_index, _ = subgraph(test_nodes, data.edge_index, relabel_nodes=False)
test_neg_edge_index = negative_sampling(
    edge_index=test_edge_index,
    num_nodes=data.num_nodes,
    num_neg_samples=test_edge_index.size(1),
)
# During inference, build a subset of nodes Q from `test_nodes`.
# In this set, each node should be part of atleast one triangle.
# Then you pair each node in Q with each other node in the test set.
# Score and rank for result retrieval.

In [62]:
# GNN Model Definition
class GNNModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, gnn_type="GCN"):
        super(GNNModel, self).__init__()
        if gnn_type == "GCN":
            self.conv1 = GCNConv(in_channels, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, out_channels)
        elif gnn_type == "GAT":
            self.conv1 = GATConv(in_channels, hidden_channels)
            self.conv2 = GATConv(hidden_channels, out_channels)
        elif gnn_type == "GIN":
            self.conv1 = GINConv(torch.nn.Sequential(
                torch.nn.Linear(in_channels, hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_channels, hidden_channels)
            ))
            self.conv2 = GINConv(torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, out_channels),
                torch.nn.ReLU(),
                torch.nn.Linear(out_channels, out_channels)
            ))
        else:
            raise ValueError("Unsupported GNN type")

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

In [63]:
def auc_loss(model, data, train_edge_index, neg_edge_index, margin=0.25):
    node_embeddings = model(data.x, train_edge_index)

    # Positive edge scores
    pos_u, pos_v = train_edge_index
    pos_scores = F.cosine_similarity(node_embeddings[pos_u], node_embeddings[pos_v])

    # Negative edge scores
    neg_u, neg_v = neg_edge_index
    neg_scores = F.cosine_similarity(node_embeddings[neg_u], node_embeddings[neg_v])

    # Margin-based ranking loss
    loss = F.relu(margin + neg_scores - pos_scores).sum()
    return loss

In [64]:
def auc_loss_eval(model, data, train_edge_index, neg_edge_index, margin=0.25):
    model.eval()
    with torch.no_grad():
        node_embeddings = model(data.x, train_edge_index)

        # Positive edge scores
        pos_u, pos_v = train_edge_index
        pos_scores = F.cosine_similarity(node_embeddings[pos_u], node_embeddings[pos_v])

        # Negative edge scores
        neg_u, neg_v = neg_edge_index
        neg_scores = F.cosine_similarity(node_embeddings[neg_u], node_embeddings[neg_v])

        # Margin-based ranking loss
        loss = F.relu(margin + neg_scores - pos_scores).sum()
    return loss

In [66]:
def train(model, data, optimizer, train_edge_index):
    model.train()
    optimizer.zero_grad()
    neg_edge_index = negative_sampling(
        edge_index=train_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=train_edge_index.size(1),
    )
    loss = auc_loss(model, data, train_edge_index, neg_edge_index)
    loss.backward()
    optimizer.step()
    return loss.item()

In [67]:
for gnn_model in ["GCN", "GAT", "GIN"]:
    print(f"GNN Model: {gnn_model}")
    model = GNNModel(data.num_features, hidden_dim, hidden_dim, gnn_type=gnn_model).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Main Training
    for epoch in range(1, epochs + 1):
        loss = train(model, data, optimizer, train_edge_index)
        if epoch % 10 == 0:
            val_loss = auc_loss_eval(model, data, val_edge_index, val_neg_edge_index)
            print(f"Epoch {epoch:2d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}") # Val AUC: {val_auc:.4f}")

    # Final Test
    test_loss = auc_loss_eval(model, data, test_edge_index, test_neg_edge_index)
    print(f"Final Test Loss: {test_loss:.4f}")

GNN Model: GCN
Epoch 10, Loss: 5.7004, Val Loss: 0.0664
Epoch 20, Loss: 2.4959, Val Loss: 0.1471
Epoch 30, Loss: 1.7200, Val Loss: 0.0282
Epoch 40, Loss: 1.7234, Val Loss: 0.0478
Epoch 50, Loss: 1.2449, Val Loss: 0.0000
Epoch 60, Loss: 1.6298, Val Loss: 0.0000
Epoch 70, Loss: 1.5178, Val Loss: 0.1735
Epoch 80, Loss: 0.5371, Val Loss: 0.2140
Epoch 90, Loss: 1.7140, Val Loss: 0.0098
Epoch 100, Loss: 0.7614, Val Loss: 0.0000
Final Test Loss: 0.1913
GNN Model: GAT
Epoch 10, Loss: 4.9183, Val Loss: 0.0257
Epoch 20, Loss: 2.7118, Val Loss: 0.2995
Epoch 30, Loss: 2.1432, Val Loss: 0.1879
Epoch 40, Loss: 2.5193, Val Loss: 0.3037
Epoch 50, Loss: 2.0421, Val Loss: 0.2762
Epoch 60, Loss: 1.6130, Val Loss: 0.2756
Epoch 70, Loss: 2.2850, Val Loss: 0.0729
Epoch 80, Loss: 1.6052, Val Loss: 0.0518
Epoch 90, Loss: 0.9481, Val Loss: 0.2684
Epoch 100, Loss: 1.4728, Val Loss: 0.1775
Final Test Loss: 0.4238
GNN Model: GIN
Epoch 10, Loss: 348.9990, Val Loss: 38.2055
Epoch 20, Loss: 223.3519, Val Loss: 15.69