In [2]:
import numpy 
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import remove_self_loops, add_self_loops, add_remaining_self_loops, degree


In [3]:
%run graph.ipynb

Data(x=[734160, 106], edge_index=[2, 5873260], y=[734160], norm=[6607420])
tensor(indices=tensor([[721050, 721050, 721050,  ..., 734157, 734158, 734159],
                       [721051, 721052, 721053,  ..., 734157, 734158, 734159]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(734160, 734160), nnz=6607420, layout=torch.sparse_coo)


In [4]:
import numpy as np
import scipy.sparse as sp


def create_graph_pyg(master_df):
    # Prepare node features
    node_features = master_df.drop(columns=['priogrid_gid']).to_numpy(dtype=float)
    node_features_tensor = torch.tensor(node_features, dtype=torch.float)

    # Prepare nodes and their indices
    nodes = master_df['priogrid_gid'].tolist()
    node_index = {node: idx for idx, node in enumerate(nodes)}

    # Prepare edge list
    edge_list = []
    for i, node in enumerate(nodes):
        # Determine neighbors based on the index; adjust for boundary conditions
        neighbors = nodes[max(0, i-4):i] + nodes[i+1:i+5]
        for neighbor in neighbors:
            if neighbor in node_index:
                # Ensure neighbor exists
                edge_list.append([node_index[node], node_index[neighbor]])

    # Convert edge list to tensor
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

    # Create PyTorch Geometric Data object
    data = Data(x=node_features_tensor, edge_index=edge_index)

    # Normalize the adjacency matrix
    edge_index, _ = remove_self_loops(data.edge_index)
    edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=data.num_nodes)
    row, col = edge_index
    deg = degree(row, data.num_nodes, dtype=data.x.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
    data.norm = norm

    return data

data = create_graph_pyg(master_df)


In [5]:
print(data)

Data(x=[5139120, 107], edge_index=[2, 41112940], norm=[46252060])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# Graph Convolution Layer
class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GraphConvolution, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x, adj):
        support = torch.matmul(x, self.weight)
        output = torch.sparse.mm(adj, support)
        output = output + self.bias
        return F.relu(output)

# GCN Model
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(input_dim, hidden_dim)
        self.gc2 = GraphConvolution(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = self.dropout(x)
        x = self.gc2(x, adj)
        return x

# Build and train GCN model
def train_gcn_model(model, features, adj_matrix, labels, epochs=20, lr=0.01):
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(features, adj_matrix)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Assuming placeholders for features, adjacency matrix, and labels
num_nodes = 5139120
num_features = 107
num_classes = 10

# Generate random features and labels for demonstration
features_placeholder = torch.rand(num_nodes, num_features)
labels_placeholder = torch.randint(0, num_classes, (num_nodes,))

# Assuming you already have the adjacency matrix in the required format


# Create GCN model
gcn_model = GCN(input_dim=num_features, hidden_dim=16, output_dim=num_classes)

# Train GCN model
train_gcn_model(gcn_model, features_placeholder, adj_matrix, labels_placeholder, epochs=20)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader
from torch_geometric.utils import degree, train_test_split_edges, remove_self_loops, add_remaining_self_loops
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import scipy.sparse as sp

class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.norm

        # Add self-connections
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=x.size(0))

        # Propagate through the GCN layers
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_weight)

        return x

# Split the data into train, validation, and test sets
train_data, val_data = train_test_split_edges(data)
val_data, test_data = train_test_split_edges(val_data, rev_examples=0.2)

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Initialize the model
model = GCN(input_dim=data.num_node_features, hidden_dim=64, output_dim=2)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Evaluate on the validation set
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            output = model(batch)
            val_loss += criterion(output, batch.y).item()
            val_preds.extend(output.argmax(dim=1).tolist())
            val_labels.extend(batch.y.tolist())

    # Calculate validation metrics
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    val_auc_roc = roc_auc_score(val_labels, val_preds)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
    print(f'Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}, Val AUC-ROC: {val_auc_roc:.4f}')

# Evaluate on the test set
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        output = model(batch)
        test_preds.extend(output.argmax(dim=1).tolist())
        test_labels.extend(batch.y.tolist())

# Calculate test metrics
test_accuracy = accuracy_score(test_labels, test_preds)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')
test_auc_roc = roc_auc_score(test_labels, test_preds)

print(f'\nTest Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}, Test AUC-ROC: {test_auc_roc:.4f}')


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.data import Data

# Graph Convolution Layer
class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GraphConvolution, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x, adj):
        support = torch.matmul(x, self.weight)
        output = torch.sparse.mm(adj, support)
        output = output + self.bias
        return F.relu(output)

# GCN Model
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(input_dim, hidden_dim)
        self.gc2 = GraphConvolution(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = self.dropout(x)
        x = self.gc2(x, adj)
        # print(x.shape)
        return x


In [8]:
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.utils import remove_self_loops, add_remaining_self_loops, degree

def create_graph_pyg_usable(master_df):
    # Prepare node features
    node_features = master_df.drop(columns=['priogrid_gid', 'ged_sb']).to_numpy(dtype=float)
    node_features_tensor = torch.tensor(node_features, dtype=torch.float)

    # Extract labels for regression
    labels = master_df['ged_sb'].to_numpy(dtype=float)  # Assuming 'ged_sb' is the regression target
    labels_tensor = torch.tensor(labels, dtype=torch.float)

    # Prepare nodes and their indices
    nodes = master_df['priogrid_gid'].tolist()
    node_index = {node: idx for idx, node in enumerate(nodes)}

    # Prepare edge list
    edge_list = []
    for i, node in enumerate(nodes):
        # Determine neighbors based on the index; adjust for boundary conditions
        neighbors = nodes[max(0, i-4):i] + nodes[i+1:i+5]
        for neighbor in neighbors:
            if neighbor in node_index:
                # Ensure neighbor exists
                edge_list.append([node_index[node], node_index[neighbor]])

    # Convert edge list to tensor
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

    # Create PyTorch Geometric Data object
    data = Data(x=node_features_tensor, edge_index=edge_index, y=labels_tensor)

    # Normalize the adjacency matrix
    edge_index, _ = remove_self_loops(data.edge_index)
    edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=data.num_nodes)
    row, col = edge_index
    deg = degree(row, data.num_nodes, dtype=data.x.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
    data.norm = norm

    # Construct the adjacency matrix
    values = torch.ones(edge_index.shape[1])
    adj_matrix = torch.sparse_coo_tensor(edge_index, values, (data.num_nodes, data.num_nodes))

    return data, adj_matrix

data, adj_matrix = create_graph_pyg_usable(master_df)


def train_gcn_model(model, data, adj_matrix, epochs=20, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(data.x, adj_matrix)
        print(output)
        print("-------")
        print(data.y)
        loss = criterion(output.view(-1), data.y)  # Flatten output for regression
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Assuming data is the direct graph data provided by Torch Geometric
data, adj_matrix = create_graph_pyg_usable(master_df)

# Create GCN model for regression
gcn_model = GCN(input_dim=data.x.size(1), hidden_dim=16, output_dim=1)

# Train GCN model for regression
train_gcn_model(gcn_model, data, adj_matrix, epochs=20)


tensor([[0.0000e+00],
        [1.9706e+16],
        [1.6043e+17],
        ...,
        [       nan],
        [       nan],
        [       nan]], grad_fn=<ReluBackward0>)
-------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
Epoch 1, Loss: nan
tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], grad_fn=<ReluBackward0>)
-------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], grad_fn=<ReluBackward0>)
-------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], grad_fn=<ReluBackward0>)
-------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]], grad_fn=<ReluBackward0>)
-------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([[nan],
        [nan],
        [nan],
        ...,
 