In [2]:
import numpy 
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import remove_self_loops, add_self_loops, add_remaining_self_loops, degree


In [3]:
%run graph.ipynb

In [4]:
import numpy as np
import scipy.sparse as sp


def create_graph_pyg(master_df, window=4):
    """Build a PyTorch Geometric ``Data`` graph from a node table.

    Every row of ``master_df`` becomes one node. All columns except
    ``priogrid_gid`` are used as that node's features. Each node is linked to
    the rows up to ``window`` positions before and after it in the frame's row
    order, truncated at the start and end of the frame.

    Edges are expressed in row positions, so the feature row ``x[i]`` and node
    ``i`` always refer to the same record, even when ``priogrid_gid`` values
    repeat (mapping ids through a dict would redirect every edge to the last
    row carrying that id).

    NOTE(review): adjacency is defined by row order only; whether neighbouring
    rows are spatial neighbours depends on how ``master_df`` is sorted -
    confirm against the upstream notebook.

    Args:
        master_df: DataFrame with a ``priogrid_gid`` column; the remaining
            columns must be convertible to float.
        window: Number of preceding and following rows each node is connected
            to. Defaults to 4, the previously hard-coded value.

    Returns:
        A ``Data`` object with
        * ``x``: float tensor ``[num_nodes, num_features]``;
        * ``edge_index``: long tensor ``[2, num_edges]`` without self-loops,
          ordered node by node with neighbours from lowest to highest row;
        * ``norm``: symmetric normalisation coefficients
          ``deg[row]**-0.5 * deg[col]**-0.5`` for ``edge_index`` with one
          self-loop per node appended, i.e. length ``num_edges + num_nodes``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    # Node features: everything except the grid-cell identifier.
    node_features = master_df.drop(columns=['priogrid_gid']).to_numpy(dtype=float)
    node_features_tensor = torch.tensor(node_features, dtype=torch.float)
    num_nodes = node_features_tensor.size(0)

    # Vectorised neighbour construction: one row per node, one column per
    # relative offset (-window..-1, 1..window). Boolean-mask indexing flattens
    # row-major, which keeps the per-node, ascending-neighbour edge order of a
    # nested loop while avoiding tens of millions of Python list appends.
    offsets = torch.cat([torch.arange(-window, 0), torch.arange(1, window + 1)])
    src = torch.arange(num_nodes).unsqueeze(1)
    dst = src + offsets.unsqueeze(0)
    src = src.expand_as(dst)
    in_range = (dst >= 0) & (dst < num_nodes)

    # Stacking always yields shape [2, E], including E == 0 for tiny inputs.
    edge_index = torch.stack([src[in_range], dst[in_range]], dim=0).contiguous()

    # Create PyTorch Geometric Data object
    data = Data(x=node_features_tensor, edge_index=edge_index)

    # Symmetric normalisation on the graph with self-loops. Offsets are never
    # zero, so edge_index has no self-loops of its own; one per node is
    # appended after the existing edges.
    loop_edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=num_nodes)
    row, col = loop_edge_index
    deg = degree(row, num_nodes, dtype=node_features_tensor.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    # Guard against isolated nodes (degree 0 -> inf); with self-loops present
    # every degree is at least 1, so this is purely defensive.
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    data.norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

    return data

# Build the graph once at notebook level. `master_df` is not defined in the
# cells shown here - presumably it is created by `%run graph.ipynb` above
# (verify before running on a fresh kernel).
data = create_graph_pyg(master_df)


In [5]:
# Show the tensor shapes stored on the graph (x, edge_index, norm).
print(data)

Data(x=[5139120, 107], edge_index=[2, 41112940], norm=[46252060])


In [7]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader
from torch_geometric.utils import degree, train_test_split_edges
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network using precomputed edge weights.

    The forward pass reads ``data.x``, ``data.edge_index`` and ``data.norm``.
    ``data.norm`` is expected to hold already-normalised coefficients for
    ``data.edge_index`` followed by one self-loop per node, so both
    convolutions are created with ``normalize=False``; leaving ``GCNConv`` at
    its default would normalise those weights a second time.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the hidden representation.
        output_dim: Number of output channels (class logits) per node.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # normalize=False: use the supplied edge weights as-is instead of
        # re-deriving symmetric normalisation inside the layer.
        self.conv1 = GCNConv(input_dim, hidden_dim, normalize=False)
        self.conv2 = GCNConv(hidden_dim, output_dim, normalize=False)

    def forward(self, data):
        """Return per-node logits of shape ``[num_nodes, output_dim]``.

        Raises:
            ValueError: If ``data.norm`` does not have one entry per edge of
                the self-loop-augmented edge index.
        """
        x, edge_index, edge_weight = data.x, data.edge_index, data.norm

        # Rebuild the self-loop-augmented edge index so it lines up with the
        # precomputed weights (existing edges first, then one loop per node).
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=x.size(0))

        # Fail early with a clear message rather than deep inside propagation.
        if edge_weight.numel() != edge_index.size(1):
            raise ValueError(
                f"data.norm has {edge_weight.numel()} entries but the graph has "
                f"{edge_index.size(1)} edges including self-loops"
            )

        # Propagate through the GCN layers
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_weight)

        return x

# Split the data into train, validation, and test sets
# NOTE(review): per the PyG documentation `train_test_split_edges` appears to
# return a single Data object (with *_pos_edge_index attributes) and to take
# `val_ratio` / `test_ratio` rather than `rev_examples` - confirm against the
# installed version; the tuple unpacking below may fail.
# NOTE(review): this is an edge-level split, while the loops further down
# compute a node-level loss against `.y` - confirm the intended task.
train_data, val_data = train_test_split_edges(data)
val_data, test_data = train_test_split_edges(val_data, rev_examples=0.2)

# Create data loaders
# Only the training loader shuffles; validation and test keep a fixed order.
# NOTE(review): each loader wraps a single object rather than a list of
# graphs - verify this iterates as intended.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Initialize the model
# Input width follows the graph's feature count; two output channels give the
# logits for a two-class problem.
model = GCN(input_dim=data.num_node_features, hidden_dim=64, output_dim=2)

# Define the loss function and optimizer
# CrossEntropyLoss consumes the raw logits returned by the model.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop: one optimisation pass over train_loader per epoch, followed
# by a validation pass that reports loss and classification metrics.
# NOTE(review): the loss reads `batch.y`; the graph built by create_graph_pyg
# carries x, edge_index and norm only, so labels must be attached as `y`
# before this cell can run - confirm where that happens.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    # The loop variable is `batch`, not `data`, so the notebook-level graph
    # `data` is not overwritten and the cell stays re-runnable.
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Evaluate on the validation set
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_scores = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            output = model(batch)
            val_loss += criterion(output, batch.y).item()
            val_preds.extend(output.argmax(dim=1).tolist())
            # Positive-class probability: ROC-AUC needs a continuous score,
            # hard 0/1 predictions reduce the curve to a single point.
            val_scores.extend(torch.softmax(output, dim=1)[:, 1].tolist())
            val_labels.extend(batch.y.tolist())

    # Calculate validation metrics
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    val_auc_roc = roc_auc_score(val_labels, val_scores)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
    print(f'Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}, Val AUC-ROC: {val_auc_roc:.4f}')

# Evaluate on the test set: collect hard predictions for the threshold-based
# metrics and positive-class probabilities for ROC-AUC.
model.eval()
test_preds = []
test_scores = []
test_labels = []
with torch.no_grad():
    # `batch` rather than `data`, so the notebook-level graph is not replaced
    # by the last mini-batch.
    for batch in test_loader:
        output = model(batch)
        test_preds.extend(output.argmax(dim=1).tolist())
        # ROC-AUC needs a continuous score; use the class-1 probability.
        test_scores.extend(torch.softmax(output, dim=1)[:, 1].tolist())
        test_labels.extend(batch.y.tolist())

# Calculate test metrics
test_accuracy = accuracy_score(test_labels, test_preds)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')
test_auc_roc = roc_auc_score(test_labels, test_scores)

print(f'\nTest Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}, Test AUC-ROC: {test_auc_roc:.4f}')



: 