# GraphSAGE Implementation

This notebook implements GraphSAGE (Graph SAmple and aggreGatE) for node classification on the PubMed citation network, loaded through PyTorch Geometric's `Planetoid` dataset.

## Setup and Data Loading

Import necessary libraries and load the Pubmed citation network dataset.

In [None]:
# Core libraries
import torch
import torch.nn.functional as F
from torch.nn import Linear, Dropout

# PyTorch Geometric for graph neural networks
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.utils import to_networkx

# Visualization and utilities
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from matplotlib.patches import Rectangle

# Seed NumPy and PyTorch with the same value so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Notebook-wide matplotlib defaults: stock style, 12x8 figures, 10pt text
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

In [None]:
# Fetch the Pubmed Planetoid dataset (stored under /tmp/Pubmed) and take its first graph
dataset = Planetoid(name='Pubmed', root='/tmp/Pubmed')
data = dataset[0]

# One-shot summary of the dataset and of the train/val/test split sizes
print(
    f'Dataset: {dataset}',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Training nodes: {data.train_mask.sum()}',
    f'Validation nodes: {data.val_mask.sum()}',
    f'Test nodes: {data.test_mask.sum()}',
    sep='\n',
)

## GraphSage Model Implementation

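Define the GraphSAGE model as a stack of `SAGEConv` layers. The model itself only aggregates over the edges it is given; neighbor sampling is performed by the `NeighborLoader` in the training section.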

In [None]:
class GraphSageModel(torch.nn.Module):
    """Stack of ``SAGEConv`` layers for node classification.

    The network maps ``num_features`` inputs to ``num_classes`` output scores
    through exactly ``num_layers`` convolutions. Every layer except the last
    is followed by ReLU and dropout; the last layer returns raw scores.

    Args:
        num_features: Size of each input node feature vector.
        hidden_dim: Width of every intermediate layer.
        num_classes: Number of output scores per node.
        num_layers: Total number of ``SAGEConv`` layers (must be >= 1).
            With ``num_layers=1`` a single layer maps inputs straight to
            class scores and ``hidden_dim`` is unused.
        dropout: Dropout probability applied after each hidden layer.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, num_features, hidden_dim, num_classes, num_layers=2, dropout=0.2):
        super().__init__()

        if num_layers < 1:
            raise ValueError(f'num_layers must be at least 1, got {num_layers}')

        self.num_layers = num_layers
        self.dropout = dropout

        # Layer widths: input -> (num_layers - 1) hidden widths -> classes.
        # Building from this list guarantees exactly `num_layers` convolutions,
        # including the single-layer case.
        widths = [num_features] + [hidden_dim] * (num_layers - 1) + [num_classes]
        self.convs = torch.nn.ModuleList(
            [SAGEConv(in_dim, out_dim) for in_dim, out_dim in zip(widths[:-1], widths[1:])]
        )

        # Dropout layer
        self.dropout_layer = Dropout(dropout)

    def forward(self, x, edge_index):
        """Return per-node class scores (no softmax applied).

        Args:
            x: Node feature matrix passed to the first convolution.
            edge_index: Edge index passed unchanged to every convolution.
        """
        # Hidden layers: convolution -> ReLU -> dropout
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = self.dropout_layer(x)

        # Final layer without activation
        return self.convs[-1](x, edge_index)

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Two-layer model with a 128-wide hidden layer, sized from the dataset
model = GraphSageModel(
    dataset.num_features,
    128,
    dataset.num_classes,
    num_layers=2,
    dropout=0.2,
)
model = model.to(device)

# Move the graph to the same device as the model, then show the architecture
data = data.to(device)
print(model)

## Training Loop

Train the GraphSage model using mini-batch training with neighbor sampling.

In [None]:
# Mini-batch loader over the training nodes: 16 seed nodes per batch,
# with one neighbor fan-out per model layer ([5, 10])
train_loader = NeighborLoader(
    data,
    input_nodes=data.train_mask,
    num_neighbors=[5, 10],
    batch_size=16,
    shuffle=True,
)

# Loss function and Adam optimizer (weight decay acts as L2 regularisation)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train_epoch():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``device``.

    Returns:
        float: Mean training loss per seed node over the epoch, or ``0.0``
        when the loader yields no batches. Each batch loss is weighted by
        its number of seed nodes so a smaller final batch does not skew the
        average. This assumes ``criterion`` returns a per-batch mean, as
        ``CrossEntropyLoss`` does by default.
    """
    model.train()
    loss_sum = 0.0
    seeds_seen = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward pass over the sampled subgraph
        out = model(batch.x, batch.edge_index)

        # Only the first `batch_size` rows are the seed (training) nodes;
        # the remaining rows are sampled neighbors and must not contribute.
        num_seeds = batch.batch_size
        loss = criterion(out[:num_seeds], batch.y[:num_seeds])

        # Backward pass
        loss.backward()
        optimizer.step()

        # Undo the per-batch mean so batches of different size weigh correctly
        loss_sum += loss.item() * num_seeds
        seeds_seen += num_seeds

    if seeds_seen == 0:
        return 0.0
    return loss_sum / seeds_seen

In [None]:
def evaluate(mask):
    """Compute classification accuracy on the nodes selected by ``mask``.

    Runs the notebook-level ``model`` in eval mode over the full graph held
    in ``data`` (no gradients) and compares the arg-max prediction with
    ``data.y`` on the selected nodes.

    Args:
        mask: Boolean tensor selecting the nodes to score.

    Returns:
        float: Fraction of selected nodes predicted correctly, or ``0.0``
        when the mask selects no nodes (instead of dividing by zero).
    """
    model.eval()

    num_selected = mask.sum().item()
    if num_selected == 0:
        return 0.0

    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        correct = (pred[mask] == data.y[mask]).sum().item()
    return correct / num_selected

# Training loop
NUM_EPOCHS = 100  # total passes over the training nodes
LOG_EVERY = 10    # print a progress line every this many epochs

# Per-epoch history, consumed by the plots further down
train_losses = []
val_accuracies = []

for epoch in range(NUM_EPOCHS):
    loss = train_epoch()
    val_acc = evaluate(data.val_mask)

    train_losses.append(loss)
    val_accuracies.append(val_acc)

    if epoch % LOG_EVERY == 0:
        # Training accuracy is only printed, never stored, so the extra
        # full-graph forward pass is run only on logging epochs.
        train_acc = evaluate(data.train_mask)
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Final test accuracy, measured with the weights from the last epoch
test_acc = evaluate(data.test_mask)
print(f'\nFinal Test Accuracy: {test_acc:.4f}')

## Results Visualization

Visualize training progress and model performance.

In [None]:
# Side-by-side panels: training loss on the left, validation accuracy on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: loss recorded once per epoch
ax1.plot(train_losses, 'b-', label='Training Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Training Loss Over Time')
ax1.legend()
ax1.grid(True)

# Right panel: validation accuracy recorded once per epoch
ax2.plot(val_accuracies, 'r-', label='Validation Accuracy')
ax2.set(xlabel='Epoch', ylabel='Accuracy', title='Validation Accuracy Over Time')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()