In [9]:
import numpy as np
import pandas as pd
import networkx as nx 

import torch
import torch.nn.functional as F
from torch_geometric.utils import from_networkx, train_test_split_edges
from torch_geometric.nn import GCNConv, VGAE

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve

import matplotlib.pyplot as plt

# Link Prediction

## Data Loading

In [13]:
G = nx.read_graphml('datasetAirports.graphml')

# Important: Clear the graph-level attributes to avoid bugs
G.graph = {}
print(f"Loaded graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

Loaded graph with 3363 nodes and 13547 edges


In [16]:
#Encoding des variables textuelles
country_encoder = LabelEncoder()
city_encoder = LabelEncoder()

countries = [data.get("country", "UNK") for _, data in G.nodes(data=True)]
cities = [data.get("city_name", "UNK") for _, data in G.nodes(data=True)]

# Fit les encoders
country_encoder.fit(countries)
city_encoder.fit(cities)

# Appliquer l'encodage
for _, data in G.nodes(data=True):
    data["country"] = country_encoder.transform([data.get("country", "UNK")])[0]
    data["city_name"] = city_encoder.transform([data.get("city_name", "UNK")])[0]

# On garde des dictoinaires pour faire le lien entre encoding et valeur de base
country_to_encoding = {label: int(country_encoder.transform([label])[0]) for label in country_encoder.classes_}
encoding_to_country = {v: k for k, v in country_to_encoding.items()}
city_to_encoding = {label: int(city_encoder.transform([label])[0]) for label in city_encoder.classes_}
encoding_to_city = {v: k for k, v in city_to_encoding.items()}

# Convert NetworkX graph to PyTorch Geometric Data object
data = from_networkx(G, group_node_attrs=["lon", "lat", "population", "country", "city_name"])

# Display the converted data
print(data)
print(type(data))
print(f"\nNode features shape (x): {data.x.shape}")
print(f"Edge index shape: {data.edge_index.shape}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print("Is directed?", nx.is_directed(G))  # False ✅
print(f"Number of node features: {data.num_node_features}")

# Check the first few node features
print(f"\nFirst 5 node feature vectors:\n{data.x[:5]}")

Data(edge_index=[2, 27094], x=[3363, 5])
<class 'torch_geometric.data.data.Data'>

Node features shape (x): torch.Size([3363, 5])
Edge index shape: torch.Size([2, 27094])
Number of nodes: 3363
Number of edges: 27094
Is directed? False
Number of node features: 5

First 5 node feature vectors:
tensor([[-1.4551e+02, -1.7354e+01,  1.0000e+04,  6.6000e+01,  9.9000e+01],
        [-1.4095e+02, -1.8067e+01,  1.0000e+04,  6.6000e+01,  1.0710e+03],
        [-1.4960e+02, -1.7550e+01,  2.6357e+04,  6.6000e+01,  2.2070e+03],
        [-1.3500e+02, -2.3033e+01,  1.0000e+04,  6.6000e+01,  9.2000e+02],
        [-1.4366e+02, -1.6585e+01,  1.0000e+04,  6.6000e+01,  1.7050e+03]])


## Data Visualization

In [18]:
# Count airports per country
country_counts = data['country'].value_counts()

# Plot top 20 countries with the most airports
top_countries = country_counts.head(20)

plt.figure(figsize=(12, 6))
top_countries.plot(kind='bar', color='coral', edgecolor='black')
plt.title("Top 20 Countries by Number of Airports")
plt.xlabel("Country")
plt.ylabel("Number of Airports")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

KeyError: 'country'

## Data Preparation before training

In [7]:
data = train_test_split_edges(data, val_ratio=0.1, test_ratio=0.1)

print(f"Training edges: {data.train_pos_edge_index.size(1)}")
print(f"Test positive edges: {data.test_pos_edge_index.size(1)}")
print(f"Test negative edges: {data.test_neg_edge_index.size(1)}")


EDGE SPLIT SUMMARY

Training edges:
  - train_pos_edge_index shape: torch.Size([2, 21676])
  - Number of training edges: 21676

Test edges (positive - actual edges to predict):
  - test_pos_edge_index shape: torch.Size([2, 2709])
  - Number of test positive edges: 2709

Test edges (negative - non-existent edges):
  - test_neg_edge_index shape: torch.Size([2, 2709])
  - Number of test negative edges: 2709

Node features (unchanged):
  - x shape: torch.Size([3363, 5])
  - All node features preserved: 3363 nodes, 5 features

VERIFICATION
Original edges (directed): 27094
Training edges: 21676 (80.0%)
Test positive edges: 2709 (10.0%)
Test negative edges: 2709

SAMPLE EDGES

Sample training edges (first 5):
  Edge 1: Node 0 <-> Node 1
  Edge 2: Node 0 <-> Node 2
  Edge 3: Node 1 <-> Node 0
  Edge 4: Node 1 <-> Node 2
  Edge 5: Node 1 <-> Node 4

Sample test positive edges (first 5):
  Edge 1: Node 1082 <-> Node 2055 (should exist)
  Edge 2: Node 118 <-> Node 665 (should exist)
  Edge 3: No



## Definition & Initialisation of encoder class

In [8]:
# Encoder class for VGAE
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)

in_channels = data.num_features
out_channels = 16

encoder = Encoder(in_channels, out_channels)
model = VGAE(encoder)

print("="*70)
print("VGAE MODEL ARCHITECTURE")
print("="*70)
print(f"\nEncoder:")
print(f"  Input features: {in_channels}")
print(f"  Hidden dimension: {2 * out_channels}")
print(f"  Latent dimension: {out_channels}")
print(f"\nModel structure:")
print(model)

# Check the encoder output dimensions
print(f"\n" + "="*70)
print("ENCODER OUTPUT DIMENSIONS")
print("="*70)
with torch.no_grad():
    mu, logstd = encoder(data.x, data.train_pos_edge_index)
    print(f"Mean (mu) shape: {mu.shape} - [{data.num_nodes} nodes × {out_channels} latent dims]")
    print(f"Log std (logstd) shape: {logstd.shape} - [{data.num_nodes} nodes × {out_channels} latent dims]")


VGAE MODEL ARCHITECTURE

Encoder:
  Input features: 5
  Hidden dimension: 32
  Latent dimension: 16

Model structure:
VGAE(
  (encoder): Encoder(
    (conv1): GCNConv(5, 32)
    (conv_mu): GCNConv(32, 16)
    (conv_logstd): GCNConv(32, 16)
  )
  (decoder): InnerProductDecoder()
)

ENCODER OUTPUT DIMENSIONS
Mean (mu) shape: torch.Size([3363, 16]) - [3363 nodes × 16 latent dims]
Log std (logstd) shape: torch.Size([3363, 16]) - [3363 nodes × 16 latent dims]


## Training - A FAIRE G JUSTE COPIÉ COLLER LE TP SUR CETTE PARTIE

In [9]:
# Ensure model and data are on the correct device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

print("="*70)
print("VGAE TRAINING AND EVALUATION")
print("="*70)

# Training function - uses ONLY training edges
def train():
    model.train()
    optimizer.zero_grad()
    
    # IMPORTANT: Encode using ONLY training edges
    z = model.encode(data_edge.x, data_edge.train_pos_edge_index)
    
    # Compute reconstruction loss on training edges only
    loss = model.recon_loss(z, data_edge.train_pos_edge_index)
    
    # Add KL divergence regularization
    loss = loss + (1 / data_edge.num_nodes) * model.kl_loss()
    
    loss.backward()
    optimizer.step()
    
    return float(loss)

# Testing function - uses training edges for encoding, test edges for evaluation
def test(pos_edge_index, neg_edge_index):
    model.eval()
    
    with torch.no_grad():
        # IMPORTANT: Encode using ONLY training edges (not test edges!)
        # This ensures we don't leak information from test set
        z = model.encode(data_edge.x, data_edge.train_pos_edge_index)
        
        # Evaluate on test edges
        auc, ap = model.test(z, pos_edge_index, neg_edge_index)
    
    return auc, ap

# Training loop
print("\nTraining Progress:")
print(f"{'Epoch':<8} {'Loss':<12} {'Test AUC':<12} {'Test AP':<12}")
print("-"*70)

num_epochs = 200
best_auc = 0

for epoch in range(1, num_epochs + 1):
    loss = train()
    
    if epoch % 20 == 0:
        auc, ap = test(data_edge.test_pos_edge_index, data_edge.test_neg_edge_index)
        print(f"{epoch:<8} {loss:<12.4f} {auc:<12.4f} {ap:<12.4f}")
        
        if auc > best_auc:
            best_auc = auc

print("\n" + "="*70)
print("FINAL TEST EVALUATION")
print("="*70)

# Final comprehensive evaluation
model.eval()

with torch.no_grad():
    # Encode using TRAINING edges only
    z = model.encode(data_edge.x, data_edge.train_pos_edge_index)
    
    # Evaluate on TEST edges
    test_auc, test_ap = model.test(z, data_edge.test_pos_edge_index, 
                                     data_edge.test_neg_edge_index)
    
    # Get predictions for positive test edges
    pos_pred = model.decoder(z, data_edge.test_pos_edge_index, sigmoid=True)
    
    # Get predictions for negative test edges  
    neg_pred = model.decoder(z, data_edge.test_neg_edge_index, sigmoid=True)

print(f"\nTest Set Metrics:")
print(f"  AUC: {test_auc:.4f}")
print(f"  AP:  {test_ap:.4f}")

print(f"\nPrediction Statistics on Test Set:")
print(f"  Positive edges (should exist):")
print(f"    Mean: {pos_pred.mean().item():.4f}")
print(f"    Std:  {pos_pred.std().item():.4f}")
print(f"  Negative edges (should NOT exist):")
print(f"    Mean: {neg_pred.mean().item():.4f}")
print(f"    Std:  {neg_pred.std().item():.4f}")

# Detailed analysis
print("\n" + "="*70)
print("DETAILED TEST SET ANALYSIS")
print("="*70)

# Combine predictions and labels
all_preds = torch.cat([pos_pred, neg_pred]).cpu().numpy()
all_labels = torch.cat([
    torch.ones(pos_pred.shape[0]),
    torch.zeros(neg_pred.shape[0])
]).numpy()

# Calculate additional metrics
from sklearn.metrics import precision_score, recall_score, f1_score

threshold = 0.5
binary_preds = (all_preds > threshold).astype(int)

precision = precision_score(all_labels, binary_preds)
recall = recall_score(all_labels, binary_preds)
f1 = f1_score(all_labels, binary_preds)

print(f"\nWith threshold = {threshold}:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")

# Count correct predictions
correct_pos = (pos_pred > threshold).sum().item()
correct_neg = (neg_pred < threshold).sum().item()
total_correct = correct_pos + correct_neg
total_test = len(pos_pred) + len(neg_pred)

print(f"\nAccuracy breakdown:")
print(f"  Positive edges correctly predicted: {correct_pos}/{len(pos_pred)} ({correct_pos/len(pos_pred)*100:.1f}%)")
print(f"  Negative edges correctly predicted: {correct_neg}/{len(neg_pred)} ({correct_neg/len(neg_pred)*100:.1f}%)")
print(f"  Overall accuracy: {total_correct}/{total_test} ({total_correct/total_test*100:.1f}%)")

# Show sample predictions
print("\n" + "="*70)
print("SAMPLE TEST PREDICTIONS")
print("="*70)

print("\nPositive test edges (first 10):")
print(f"{'Edge':<15} {'Probability':<12} {'Prediction':<12}")
print("-"*40)
for i in range(min(10, len(pos_pred))):
    src = data_edge.test_pos_edge_index[0, i].item()
    dst = data_edge.test_pos_edge_index[1, i].item()
    prob = pos_pred[i].item()
    pred_label = "EXISTS ✓" if prob > threshold else "NO EDGE ✗"
    print(f"{src:2d} <-> {dst:2d}      {prob:<12.4f} {pred_label}")

print("\nNegative test edges (first 10):")
print(f"{'Edge':<15} {'Probability':<12} {'Prediction':<12}")
print("-"*40)
for i in range(min(10, len(neg_pred))):
    src = data_edge.test_neg_edge_index[0, i].item()
    dst = data_edge.test_neg_edge_index[1, i].item()
    prob = neg_pred[i].item()
    pred_label = "NO EDGE ✓" if prob < threshold else "EXISTS ✗"
    print(f"{src:2d} <-> {dst:2d}      {prob:<12.4f} {pred_label}")

print("\n" + "="*70)
print("KEY POINTS ABOUT TRAIN/TEST SEPARATION")
print("="*70)
print("""
✓ CORRECT: During training, encode with train_pos_edge_index
✓ CORRECT: During testing, encode with train_pos_edge_index
✓ CORRECT: Evaluate predictions on test_pos_edge_index and test_neg_edge_index

✗ WRONG: Using test edges during encoding (information leakage!)
✗ WRONG: Evaluating on training edges (overly optimistic results)

The model learns node embeddings from training edges only.
Test edges are completely hidden during training and encoding.
This ensures fair evaluation of link prediction performance.
""")


VGAE TRAINING AND EVALUATION

Training Progress:
Epoch    Loss         Test AUC     Test AP     
----------------------------------------------------------------------


NameError: name 'data_edge' is not defined