In [1]:
!pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu124.html --no-cache-dir
!pip install pyg-lib -f https://data.pyg.org/whl/torch-2.5.1+cu124.html --no-cache-dir
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html


In [2]:
import torch
torch.__version__, torch.version.cuda

('2.5.1+cu124', '12.4')

In [3]:
import torch_sparse

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GINConv, SAGEConv
from torch_geometric.data import Data
# Import multiple metrics from sklearn
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score, ndcg_score
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import negative_sampling, coalesce
from torch_geometric.loader import LinkNeighborLoader
from tqdm.auto import tqdm
import numpy as np
import random

def set_seed(seed: int = 42):
    """Sets the seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # for multi-GPU
        # Optional: If you need deterministic behavior, uncomment these lines.
        # This might slow down training.
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    print(f"Random seed set to {seed}")

set_seed(42)

# Determine the device to use (single GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Random seed set to 42
Using device: cuda


In [5]:

class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, model_type='GIN'):
        super(LinkPredictor, self).__init__()
        self.num_layers = num_layers # Store the number of layers as an attribute
        self.convs = torch.nn.ModuleList()
        if model_type == 'GIN':
            for i in range(num_layers):
                # Using Linear layers within GINConv
                nn_GIN = torch.nn.Sequential(
                    torch.nn.Linear(in_channels if i == 0 else hidden_channels, hidden_channels),
                    torch.nn.BatchNorm1d(hidden_channels),
                    torch.nn.ReLU(),
                    torch.nn.Linear(hidden_channels, hidden_channels),
                    torch.nn.BatchNorm1d(hidden_channels),
                    torch.nn.ReLU()
                )
                self.convs.append(GINConv(nn_GIN))
        elif model_type == 'SAGE':
            for i in range(num_layers):
                # SAGEConv layers
                self.convs.append(SAGEConv(in_channels if i == 0 else hidden_channels, hidden_channels))
        else:
            raise ValueError("Model type must be 'GIN' or 'SAGE'")

        # Linear layer for prediction (used in the predict method)
        # Note: The original code defined this but didn't use it in predict.
        # The predict method uses dot product. If you intended to use a linear layer
        # for prediction, you would modify the predict method.
        # Keeping it here as per original code structure, though it's unused in predict.
        self.lin = torch.nn.Linear(2 * hidden_channels, out_channels)
        self.model_type = model_type

    def forward(self, x, edge_index):
        # Pass node features through graph convolution layers
        for conv_layer in self.convs:
            # Ensure inputs to convolution are on the correct device (handled by loader batch.to(device))
            x = conv_layer(x, edge_index)
            x = F.relu(x) # Apply ReLU activation after each layer
        return x

    # Predict method using dot product between node embeddings
    # This method now expects edge indices that are LOCAL to the provided embeddings `z`
    def predict(self, z, edge_index_pos, edge_index_neg):
        # Calculate scores for positive links
        if edge_index_pos.numel() > 0:
            # edge_index_pos should already be on the correct device and local to z
            row_pos, col_pos = edge_index_pos
            # Dot product between embeddings of connected nodes
            pos_out = (z[row_pos] * z[col_pos]).sum(dim=-1)
        else:
            pos_out = torch.empty(0).to(z.device) # Handle case with no positive edges

        # Calculate scores for negative links
        if edge_index_neg.numel() > 0:
            # edge_index_neg should already be on the correct device and local to z
            row_neg, col_neg = edge_index_neg
            # Dot product between embeddings of disconnected nodes
            neg_out = (z[row_neg] * z[col_neg]).sum(dim=-1)
        else:
            neg_out = torch.empty(0).to(z.device) # Handle case with no negative edges

        # Apply sigmoid to get probabilities
        return torch.sigmoid(pos_out), torch.sigmoid(neg_out)

# Helper function to create labels for positive and negative edges
def get_link_labels(pos_edge_index, neg_edge_index):
    E_pos = pos_edge_index.size(1) # Number of positive edges
    E_neg = neg_edge_index.size(1) # Number of negative edges
    # Determine device based on input tensors
    device = pos_edge_index.device if pos_edge_index.numel() > 0 else \
             (neg_edge_index.device if neg_edge_index.numel() > 0 else 'cpu')
    # Create labels: 1 for positive, 0 for negative
    labels = torch.cat([
        torch.ones(E_pos, device=device),
        torch.zeros(E_neg, device=device)
    ], dim=0).float()
    return labels

# Training function adapted for LinkNeighborLoader
def train(model, loader, optimizer, device):
    model.train() # Set model to training mode
    total_loss = 0
    total_edges = 0 # To track the number of edges processed for averaging loss

    # Wrap the loader with tqdm for a progress bar
    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad() # Clear gradients for this batch

        # Move batch data to the correct device
        batch = batch.to(device)

        # Compute node embeddings using the message-passing edges in the batch
        # batch.x and batch.edge_index are already on the device
        z = model(batch.x, batch.edge_index)

        # LinkNeighborLoader provides the prediction edges directly in batch.edge_label_index
        # and their labels in batch.edge_label.
        # These indices should be LOCAL to the nodes in the batch's subgraph.

        train_edge_label_index = batch.edge_label_index
        train_edge_label = batch.edge_label

        # Filter local edge_label_index to get positive and negative edges based on edge_label
        pos_mask = train_edge_label == 1
        neg_mask = train_edge_label == 0

        pos_edge_index_local = train_edge_label_index[:, pos_mask]
        neg_edge_index_local = train_edge_label_index[:, neg_mask]

        # Handle cases where a batch might result in no positive or negative edges
        if pos_edge_index_local.numel() == 0 and neg_edge_index_local.numel() == 0:
            # print("Warning: No positive or negative training edges found in batch. Skipping batch.")
            continue # Skip this batch if no relevant edges

        # Get predictions for positive and negative links using LOCAL indices
        # The predict method now receives indices local to the batch's z tensor
        pos_out, neg_out = model.predict(z, pos_edge_index_local, neg_edge_index_local)
        # Get corresponding labels
        labels = get_link_labels(pos_edge_index_local, neg_edge_index_local) # get_link_labels handles device

        # Concatenate predictions for loss calculation
        predictions = torch.cat([pos_out, neg_out], dim=0)

        # Check if predictions or labels are empty before calculating loss
        if predictions.numel() == 0 or labels.numel() == 0:
            # print("Warning: No training predictions or labels in batch. Skipping batch.")
            continue # Skip this batch if no predictions or labels

        # Calculate Binary Cross-Entropy loss
        loss = F.binary_cross_entropy(predictions, labels)

        loss.backward() # Backpropagate the loss
        optimizer.step() # Update model parameters

        total_loss += loss.item() * predictions.size(0) # Accumulate loss (scaled by number of predictions in batch)
        total_edges += predictions.size(0) # Accumulate the number of edges processed

    # Return average loss over all processed edges
    if total_edges > 0:
        return total_loss / total_edges
    else:
        return torch.tensor(0.0, device=device) # Return 0 if no edges were processed


# Evaluation function for classification metrics (AUC, F1, MAP)
@torch.no_grad() # Disable gradient calculation for evaluation
def evaluate_classification_metrics(model, loader, device):
    model.eval() # Set model to evaluation mode
    y_true = []
    y_pred = []

    # Wrap the loader with tqdm for a progress bar
    for batch in tqdm(loader, desc="Evaluating Classification Metrics"):
        # Move batch data to the correct device
        batch = batch.to(device)

        # Compute node embeddings using the message-passing edges in the batch
        z = model(batch.x, batch.edge_index)

        # LinkNeighborLoader provides the prediction edges directly in batch.edge_label_index
        # and their labels in batch.edge_label.
        # These indices should be LOCAL to the nodes in the batch's subgraph.

        eval_edge_label_index = batch.edge_label_index
        eval_edge_label = batch.edge_label

        # Filter local edge_label_index to get positive and negative edges based on edge_label
        pos_mask = eval_edge_label == 1
        neg_mask = eval_edge_label == 0

        pos_edge_index_local = eval_edge_label_index[:, pos_mask]
        neg_edge_index_local = eval_edge_label_index[:, neg_mask]

        # Handle cases where a batch might result in no positive or negative edges
        if pos_edge_index_local.numel() == 0 and neg_edge_index_local.numel() == 0:
            continue # Skip this batch if no relevant edges

        # Get predictions for positive and negative links using LOCAL indices
        pos_out, neg_out = model.predict(z, pos_edge_index_local, neg_edge_index_local)
        # Get corresponding labels (using the filtered labels)
        labels = get_link_labels(pos_edge_index_local, neg_edge_index_local) # get_link_labels handles device

        # Concatenate predictions for AUC calculation
        predictions = torch.cat([pos_out, neg_out], dim=0)

        # Check if predictions or labels are empty before processing
        if predictions.numel() == 0 or labels.numel() == 0:
            continue # Skip this batch

        # Append to lists (move to CPU for scikit-learn)
        y_pred.append(predictions.cpu())
        y_true.append(labels.cpu())

    # Concatenate all batch results
    if len(y_true) == 0 or len(y_pred) == 0:
         print("Warning: No evaluation batches processed for classification metrics. Metrics are not defined.")
         return {
             'roc_auc': 0.0,
             'f1': 0.0,
             'map': 0.0, # MAP calculated on flat list
         }

    y_pred = torch.cat(y_pred, dim=0).numpy()
    y_true = torch.cat(y_true, dim=0).numpy()

    # --- Calculate Classification Metrics ---
    metrics = {}

    # ROC AUC
    if len(set(y_true)) < 2:
        print(f"Warning: Only one class present in y_true ({set(y_true)}). ROC AUC, F1, and MAP are not defined.")
        metrics['roc_auc'] = 0.0
        metrics['f1'] = 0.0
        metrics['map'] = 0.0
    else:
        metrics['roc_auc'] = roc_auc_score(y_true, y_pred)

        # F1 Score (requires a threshold, using 0.5)
        y_pred_binary = (y_pred > 0.5).astype(int)
        metrics['f1'] = f1_score(y_true, y_pred_binary)

        # Mean Average Precision (MAP) - calculated on the flat list of predictions
        metrics['map'] = average_precision_score(y_true, y_pred)

    return metrics

In [6]:

# Evaluation function for ranking metrics (NDCG, MAP@k)
@torch.no_grad()
def evaluate_ranking_metrics(model, data_split, device, num_neg_samples_per_pos=100):
    model.eval()
    # We need the full data object to sample negative edges and get node embeddings
    # for ranking evaluation.
    full_data = data_split # data_split contains the full graph structure for sampling

    # Get positive edges from the evaluation split
    eval_edge_label_index = data_split.edge_label_index
    eval_edge_label = data_split.edge_label
    pos_eval_edge_index_global = eval_edge_label_index[:, eval_edge_label == 1]

    if pos_eval_edge_index_global.numel() == 0:
        print("Warning: No positive edges found for ranking evaluation. NDCG and MAP are not defined.")
        return {'ndcg': 0.0, 'map_at_k': 0.0}

    all_ndcg_scores = []
    all_ap_scores = [] # Average Precision for MAP

    # Create a LinkNeighborLoader specifically for the positive evaluation edges
    # This will sample subgraphs for each positive edge and its sampled negatives,
    # allowing us to compute embeddings in batches.
    # We set batch_size to a larger value for efficiency.
    ranking_batch_size = 2**10 # Increased batch size for ranking evaluation loader

    ranking_loader = LinkNeighborLoader(
        full_data, # Use the full data object for sampling
        num_neighbors=[model.num_layers] * 2, # Sample enough neighbors for the model's receptive field
        batch_size=ranking_batch_size, # Use a larger batch size
        edge_label_index=pos_eval_edge_index_global, # Only provide the positive evaluation edges
        edge_label=torch.ones(pos_eval_edge_index_global.size(1), dtype=torch.float), # Labels are all 1 for positive edges
        neg_sampling_ratio=num_neg_samples_per_pos, # Sample negatives for each positive edge
        shuffle=False, # Order doesn't matter for evaluation
        num_workers=0, # Use 0 workers for simplicity in this specific ranking loop
    )

    # Iterate through batches from the ranking loader
    for batch in tqdm(ranking_loader, desc="Evaluating Ranking Metrics"):
        # Move batch data to the correct device
        batch = batch.to(device)

        # Compute node embeddings for the nodes in this batch's subgraph
        z = model(batch.x, batch.edge_index)

        # The batch now contains multiple positive edges and their sampled negatives
        ranking_edge_label_index_local = batch.edge_label_index
        ranking_edge_label = batch.edge_label # Contains 1s for positive, 0s for negatives

        # Ensure we have both positive and negative samples in this batch
        if (batch.edge_label == 1).sum() == 0 or (batch.edge_label == 0).sum() == 0:
             # print("Warning: Batch does not contain both positive and negative samples for ranking. Skipping.")
             continue

        # Get prediction scores for the edges in this batch
        # batch.edge_label_index contains the local indices of the prediction edges
        pos_mask_in_batch = batch.edge_label == 1
        neg_mask_in_batch = batch.edge_label == 0

        pos_edge_index_local = ranking_edge_label_index_local[:, pos_mask_in_batch]
        neg_edge_index_local = ranking_edge_label_index_local[:, neg_mask_in_batch]

        pos_out, neg_out = model.predict(z, pos_edge_index_local, neg_edge_index_local)

        # Combine positive and negative scores and labels for ranking
        ranking_scores = torch.cat([pos_out, neg_out], dim=0).cpu().numpy()
        # Use batch.edge_label directly for labels
        ranking_labels = batch.edge_label.cpu().numpy()

        # Calculate NDCG and Average Precision (for MAP) for this ranking
        # NDCG@k requires specifying k. Let's use k = num_neg_samples_per_pos + 1 (positive + negatives)
        k = num_neg_samples_per_pos + 1 # k is based on the number of sampled negatives per positive edge + the positive edge itself
        # Ensure there are enough samples for NDCG@k calculation
        if len(ranking_labels) >= k:
             # Reshape for ndcg_score: y_true needs to be shape (n_samples, 1) or (n_samples,)
             # y_score needs to be shape (n_samples,)
             # Note: ndcg_score expects relevance scores (y_true) and prediction scores (y_score).
             # Our labels are 1 for positive, 0 for negative, which are suitable as relevance scores.
             all_ndcg_scores.append(ndcg_score(ranking_labels.reshape(1, -1), ranking_scores.reshape(1, -1), k=k))
             # Average Precision (AP) for this positive edge
             all_ap_scores.append(average_precision_score(ranking_labels, ranking_scores))
        # else:
             # print(f"Warning: Not enough samples ({len(ranking_labels)}) for NDCG@{k}. Skipping.")


    # Calculate average NDCG and MAP over all processed positive edges
    avg_ndcg = np.mean(all_ndcg_scores) if all_ndcg_scores else 0.0
    avg_map = np.mean(all_ap_scores) if all_ap_scores else 0.0

    return {'ndcg': avg_ndcg, 'map_at_k': avg_map}


In [7]:
import torch
import numpy as np

In [8]:
# --- Example Usage with Larger Synthetic Data ---
# Convert NumPy arrays to PyTorch tensors

print("Loading author embeddings and edges...")
try:
    # Use mmap_mode='r' for potentially large files to avoid loading everything into RAM at once
    author_embedings_np = np.load('/kaggle/input/author_pred/pytorch/default/1/author_embeddings.npy', mmap_mode='r')
    edges_np = np.load('/kaggle/input/edges-dm/edges.npy', mmap_mode='r')
    print("Data loaded successfully.")
    print(f"Author embeddings shape: {author_embedings_np.shape}")
    print(f"Edges shape: {edges_np.shape}")
except FileNotFoundError:
    print("Error: Make sure 'author_embeddings.npy' and 'edges.npy' are in the correct directory.")


x_features = torch.tensor(author_embedings_np, dtype=torch.float)
# edges are typically (2, num_edges), representing source and target nodes
edge_index = torch.tensor(edges_np, dtype=torch.long).contiguous() # Transpose to get (2, num_edges)
print(x_features.shape,edge_index.shape)

del author_embedings_np
del edges_np

# Get the number of nodes. Assuming node IDs are 0-indexed up to the maximum ID in edges.
# A safer approach is to use the shape of the feature matrix if it's guaranteed to have
# features for all nodes 0 to num_nodes-1.
# If node IDs in edges might not cover all nodes in the embedding matrix,
# num_nodes should be derived from the embedding matrix shape.
num_nodes = x_features.size(0)
print(f"Number of nodes inferred from embeddings: {num_nodes}")

# Make the graph undirected by adding reverse edges
# Use coalesce to remove self-loops and duplicate edges after adding reverse edges
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=-1)
edge_index = coalesce(edge_index, num_nodes=num_nodes)
print(f"Processed edge_index (undirected, no self-loops/duplicates) shape: {edge_index.shape}")

# Create the PyTorch Geometric Data object
# Keep data on CPU initially
data_obj = Data(x=x_features, edge_index=edge_index, num_nodes=num_nodes)
print("PyTorch Geometric Data object created.")

print("Data object created.")
print(f"Approximate RAM usage for data_obj: {(data_obj.x.element_size() * data_obj.x.numel() + data_obj.edge_index.element_size() * data_obj.edge_index.numel()) / (1024**3):.2f} GB")

Loading author embeddings and edges...
Data loaded successfully.
Author embeddings shape: (3244445, 559)
Edges shape: (2, 16784250)
torch.Size([3244445, 559]) torch.Size([2, 16784250])
Number of nodes inferred from embeddings: 3244445
Processed edge_index (undirected, no self-loops/duplicates) shape: torch.Size([2, 16111346])
PyTorch Geometric Data object created.
Data object created.
Approximate RAM usage for data_obj: 7.00 GB


In [9]:
# Configure RandomLinkSplit
# num_val=0.0 means no validation set
# num_test=0.4 means 40% of edges will be used for testing (both positive and sampled negative)
# is_undirected=True ensures edges are split symmetrically
# add_negative_train_samples=True ensures negative samples are added to the training split's edge_label_index
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=True, # Ensure negative samples are generated for training
)

# Apply the transform to the data object
# The resulting splits will be on CPU initially
print("Applying RandomLinkSplit transform...")
train_data_split, val_data_split, test_data_split = transform(data_obj)
print("Transform applied. Split data is on CPU.")

Applying RandomLinkSplit transform...
Transform applied. Split data is on CPU.


In [10]:

# --- Create LinkNeighborLoaders for mini-batching ---

# Define the number of neighbors to sample for each layer
# This is a critical parameter for memory usage and performance.
# Adjust based on GPU memory. Higher values might increase GPU utilization
# but require more memory.
num_neighbors = [10, 10] # Sample 10 neighbors for each of the 2 layers

# Define the batch size for prediction edges.
# Larger batch size can increase GPU utilization but requires more GPU memory.
# Adjust based on GPU memory.
batch_size = 2**15

# Define the number of CPU workers for data loading.
# Increase this to speed up batch preparation and reduce GPU idle time,
# provided you have enough CPU cores and RAM.
num_workers = 0 # Increased num_workers

# Create the training LinkNeighborLoader
# edge_label_index and edge_label specify the prediction edges for which
# neighbors should be sampled (for both endpoints).
# shuffle=True shuffles the order of batches of prediction edges.
train_loader = LinkNeighborLoader(
    train_data_split,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    edge_label_index=train_data_split.edge_label_index, # Provide the edges for prediction
    edge_label=train_data_split.edge_label, # Provide the labels for prediction edges
    shuffle=True,
    num_workers=num_workers,
)

# Create the test LinkNeighborLoader
# For testing, use the test split's prediction edges.
# shuffle=False as order doesn't matter for evaluation.
val_loader = LinkNeighborLoader(
    val_data_split,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    edge_label_index=val_data_split.edge_label_index, # Provide the edges for prediction
    edge_label=val_data_split.edge_label, # Provide the labels for prediction edges
    shuffle=False,
    num_workers=num_workers,
)

test_loader = LinkNeighborLoader(
    test_data_split,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    edge_label_index=test_data_split.edge_label_index, # Provide the edges for prediction
    edge_label=test_data_split.edge_label, # Provide the labels for prediction edges
    shuffle=False,
    num_workers=num_workers,
)

print(f"Created LinkNeighborLoaders with {num_neighbors} neighbors sampled per layer.")
print(f"Train loader will generate batches based on {train_data_split.edge_label_index.size(1)} prediction edges.")
print(f"Val loader will generate batches based on {val_loader.edge_label_index.size(1)} prediction edges.")
print(f"Test loader will generate batches based on {test_loader.edge_label_index.size(1)} prediction edges.")

Created LinkNeighborLoaders with [10, 10] neighbors sampled per layer.
Train loader will generate batches based on 12889078 prediction edges.
Val loader will generate batches based on 1611134 prediction edges.
Test loader will generate batches based on 1611134 prediction edges.


In [11]:

# Hyperparameters for the model and training
in_channels = data_obj.num_node_features # Input dimension is the number of node features
hidden_channels = 32 # Dimension of hidden layers
out_channels = 1 # Output dimension for link prediction (a score)
num_layers = 2 # Number of graph convolution layers (matches num_neighbors length)
learning_rate = 0.01
epochs = 5 # Reduced epochs for quicker testing with large data

model_type = 'SAGE' # or 'SAGE'

# Initialize the model on CPU first
print(f"Initializing model ({model_type}) on CPU...")
model = LinkPredictor(in_channels, hidden_channels, out_channels, num_layers, model_type=model_type)
print("Model initialized on CPU.")

# Check for NaNs/Infs in model parameters (optional but good practice)
for name, param in model.named_parameters():
    if torch.isnan(param).any() or torch.isinf(param).any():
        print(f"Warning: Parameter {name} contains NaN or Inf values after initialization.")

# Move the model to the selected device
print(f"Moving model to {device}...")
model.to(device)
print("Model moved to device.")

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Optimizer initialized.")

Initializing model (SAGE) on CPU...
Model initialized on CPU.
Moving model to cuda...
Model moved to device.
Optimizer initialized.


In [12]:
num_neg_samples_for_ranking = 5 # Number of negative samples per positive edge for ranking

In [13]:
for epoch in range(epochs):
    # Train the model using the training loader
    loss = train(model, train_loader, optimizer, device)

    val_ranking_metrics = evaluate_ranking_metrics(model, val_data_split, device, num_neg_samples_for_ranking)

    # Evaluate classification metrics
    val_classification_metrics = evaluate_classification_metrics(model, val_loader, device)

    # Print progress with multiple metrics
    print(f'Epoch: {epoch+1:03d}, Loss: {loss:.4f}, '
          f'Val AUC: {val_classification_metrics["roc_auc"]:.4f}, Val F1: {val_classification_metrics["f1"]:.4f}, Val MAP (flat): {val_classification_metrics["map"]:.4f}, '
          f'Val NDCG@{num_neg_samples_for_ranking}: {val_ranking_metrics["ndcg"]:.4f}, Val MAP@{num_neg_samples_for_ranking}: {val_ranking_metrics["map_at_k"]:.4f}')
    if device.type == 'cuda':
            torch.cuda.empty_cache()

print("\nTraining finished.")

# Final evaluation on the test set after training
final_test_classification_metrics = evaluate_classification_metrics(model, test_loader, device)
final_test_ranking_metrics = evaluate_ranking_metrics(model, test_data_split, device, num_neg_samples_for_ranking)

print(f'Final Test AUC: {final_test_classification_metrics["roc_auc"]:.4f}, '
      f'Final Test F1: {final_test_classification_metrics["f1"]:.4f}, '
      f'Final Test MAP (flat): {final_test_classification_metrics["map"]:.4f}, '
      f'Final Test NDCG@{num_neg_samples_for_ranking}: {final_test_ranking_metrics["ndcg"]:.4f}, '
      f'Final Test MAP@{num_neg_samples_for_ranking}: {final_test_ranking_metrics["map_at_k"]:.4f}')
torch.save(model.state_dict(), f'model_{model_type}_checkpoint_final.pth')

Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 001, Loss: 0.6306, Val AUC: 0.7894, Val F1: 0.7474, Val MAP (flat): 0.7629, Val NDCG@5: 0.7994, Val MAP@5: 0.3230


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 002, Loss: 0.5852, Val AUC: 0.7944, Val F1: 0.7617, Val MAP (flat): 0.7729, Val NDCG@5: 0.7804, Val MAP@5: 0.3127


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 003, Loss: 0.5793, Val AUC: 0.7923, Val F1: 0.7643, Val MAP (flat): 0.7695, Val NDCG@5: 0.8172, Val MAP@5: 0.3085


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 004, Loss: 0.5773, Val AUC: 0.8019, Val F1: 0.7660, Val MAP (flat): 0.7807, Val NDCG@5: 0.7987, Val MAP@5: 0.3108


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 005, Loss: 0.5768, Val AUC: 0.7912, Val F1: 0.7635, Val MAP (flat): 0.7692, Val NDCG@5: 0.8126, Val MAP@5: 0.3007

Training finished.


Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Final Test AUC: 0.8039, Final Test F1: 0.7739, Final Test MAP (flat): 0.7823, Final Test NDCG@5: 0.8311, Final Test MAP@5: 0.3044


In [14]:
model.cpu()
torch.cuda.empty_cache()

In [15]:
model_type = 'GIN' # or 'SAGE'

# Initialize the model on CPU first
print(f"Initializing model ({model_type}) on CPU...")
model = LinkPredictor(in_channels, hidden_channels, out_channels, num_layers, model_type=model_type)
print("Model initialized on CPU.")

# Check for NaNs/Infs in model parameters (optional but good practice)
for name, param in model.named_parameters():
    if torch.isnan(param).any() or torch.isinf(param).any():
        print(f"Warning: Parameter {name} contains NaN or Inf values after initialization.")

# Move the model to the selected device
print(f"Moving model to {device}...")
model.to(device)
print("Model moved to device.")

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Optimizer initialized.")

Initializing model (GIN) on CPU...
Model initialized on CPU.
Moving model to cuda...
Model moved to device.
Optimizer initialized.


In [16]:
for epoch in range(epochs):
    # Train the model using the training loader
    loss = train(model, train_loader, optimizer, device)

    # Evaluate classification metrics
    val_classification_metrics = evaluate_classification_metrics(model, val_loader, device)

    # Evaluate ranking metrics (NDCG, MAP@k)
    # Note: This can be slow for large test sets as it iterates through positive edges.
    # Adjust num_neg_samples_per_pos or sample a subset of positive edges for faster evaluation.
    val_ranking_metrics = evaluate_ranking_metrics(model, val_data_split, device, num_neg_samples_for_ranking)


    # Print progress with multiple metrics
    print(f'Epoch: {epoch+1:03d}, Loss: {loss:.4f}, '
          f'Val AUC: {val_classification_metrics["roc_auc"]:.4f}, Val F1: {val_classification_metrics["f1"]:.4f}, Val MAP (flat): {val_classification_metrics["map"]:.4f}, '
          f'Val NDCG@{num_neg_samples_for_ranking}: {val_ranking_metrics["ndcg"]:.4f}, Val MAP@{num_neg_samples_for_ranking}: {val_ranking_metrics["map_at_k"]:.4f}')


    # Save model checkpoint (optional)
    # torch.save(model.state_dict(), f'model_checkpoint_epoh_{epoch}.pth')

print("\nTraining finished.")

# Final evaluation on the test set after training
final_test_classification_metrics = evaluate_classification_metrics(model, test_loader, device)
final_test_ranking_metrics = evaluate_ranking_metrics(model, test_data_split, device, num_neg_samples_for_ranking)

print(f'Final Test AUC: {final_test_classification_metrics["roc_auc"]:.4f}, '
      f'Final Test F1: {final_test_classification_metrics["f1"]:.4f}, '
      f'Final Test MAP (flat): {final_test_classification_metrics["map"]:.4f}, '
      f'Final Test NDCG@{num_neg_samples_for_ranking}: {final_test_ranking_metrics["ndcg"]:.4f}, '
      f'Final Test MAP@{num_neg_samples_for_ranking}: {final_test_ranking_metrics["map_at_k"]:.4f}')
torch.save(model.state_dict(), f'model_{model_type}_checkpoint_final.pth')

Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Epoch: 001, Loss: 0.5558, Val AUC: 0.8508, Val F1: 0.7941, Val MAP (flat): 0.8434, Val NDCG@5: 0.7177, Val MAP@5: 0.3378


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Epoch: 002, Loss: 0.4867, Val AUC: 0.8660, Val F1: 0.8132, Val MAP (flat): 0.8596, Val NDCG@5: 0.6160, Val MAP@5: 0.3512


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Epoch: 003, Loss: 0.4730, Val AUC: 0.8679, Val F1: 0.8193, Val MAP (flat): 0.8609, Val NDCG@5: 0.6019, Val MAP@5: 0.3539


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Epoch: 004, Loss: 0.4676, Val AUC: 0.8742, Val F1: 0.8215, Val MAP (flat): 0.8680, Val NDCG@5: 0.7261, Val MAP@5: 0.3750


Training:   0%|          | 0/394 [00:00<?, ?it/s]

Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Epoch: 005, Loss: 0.4645, Val AUC: 0.8722, Val F1: 0.8263, Val MAP (flat): 0.8663, Val NDCG@5: 0.7102, Val MAP@5: 0.3736

Training finished.


Evaluating Classification Metrics:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating Ranking Metrics:   0%|          | 0/787 [00:00<?, ?it/s]

Final Test AUC: 0.8878, Final Test F1: 0.8390, Final Test MAP (flat): 0.8817, Final Test NDCG@5: 0.7258, Final Test MAP@5: 0.3770


In [17]:
model.cpu()
torch.cuda.empty_cache()