In [None]:
import pandas as pd
import numpy as np
import wandb
import torch
from sklearn.metrics import f1_score

import sys
import os
sys.path.append(os.path.abspath("/home/lideyi/AKI_GNN/notebooks/utils"))
from metrics import performance_per_class, visualize_embeddings

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# login wandb
wandb.login(key="62d0c78e72de6dacd620fc6d13ebfecfa7ce68a1")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mericli[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/lideyi/.netrc


True

# Read Dataset

In [None]:
onset_df_pilot = pd.read_csv('/blue/yonghui.wu/lideyi/AKI_GNN/raw_data/norm_df_pilot.csv')

# Build PyG Data

In [6]:
from torch_geometric.data import Data
from sklearn.neighbors import kneighbors_graph

In [7]:
feature_columns = [col for col in onset_df_pilot.columns if col not in ['AKI_TARGET', 'TRAIN_SET', 'VAL_SET', 'TEST_SET']]
node_features = onset_df_pilot[feature_columns].copy(deep = True).values
node_labels = onset_df_pilot['AKI_TARGET'].copy(deep = True).values
train_mask = onset_df_pilot['TRAIN_SET'].copy(deep = True).values
val_mask = onset_df_pilot['VAL_SET'].copy(deep = True).values
test_mask = onset_df_pilot['TEST_SET'].copy(deep = True).values

In [8]:
# Generate a k-NN graph (e.g., k=5), note that the returned matrix is not symmetric
k = 5
A = kneighbors_graph(node_features, k, mode='connectivity', metric = 'cosine', include_self=False, n_jobs = -1).toarray()
# make adjacent matrix symmetric
A = A + A.T
# Ensure binary adjacent matrix
A = (A > 0).astype(int)
edge_index = (torch.tensor(A) > 0).nonzero().t().contiguous()
edge_index = edge_index.to(torch.long)

In [9]:
data = Data(x = torch.tensor(node_features, dtype = torch.float), 
            edge_index = edge_index, y = torch.tensor(node_labels, dtype = torch.long), 
            num_classes = len(np.unique(node_labels)),
            train_mask = torch.tensor(train_mask, dtype = torch.bool), 
            val_mask = torch.tensor(val_mask, dtype = torch.bool), 
            test_mask = torch.tensor(test_mask, dtype = torch.bool))

In [10]:
# analyse the graph
print(f'Number of features: {data.num_features}')
print(f'Number of classes: {data.num_classes}')
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Number of features: 67
Number of classes: 4
Number of nodes: 41467
Number of edges: 311452
Average node degree: 7.51
Number of training nodes: 23486
Training node label rate: 0.57
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [11]:
# turn the data into loader
from torch_geometric.loader import ClusterData, ClusterLoader

torch.manual_seed(888)
cluster_data = ClusterData(data, num_parts=128)  # 1. Create subgraphs.
data_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)  # 2. Stochastic partioning scheme.

for step, sub_data in enumerate(data_loader):
    print(f'Number of nodes in one batch: {sub_data.num_nodes}')
    print(sub_data)
    break

Number of nodes in one batch: 10432
Data(x=[10432, 67], y=[10432], num_classes=4, train_mask=[10432], val_mask=[10432], test_mask=[10432], edge_index=[2, 71312])


Computing METIS partitioning...
Done!


# GCN

In [12]:
from torch_geometric.nn import GATConv

In [13]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, n_class, hidden_dims, dropout):
        super().__init__()
        torch.manual_seed(888)
        self.conv1 = GATConv(input_dim, hidden_dims)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = GATConv(hidden_dims, hidden_dims)
        self.conv3 = GATConv(hidden_dims, hidden_dims)
        self.linear = torch.nn.Linear(hidden_dims, n_class)

    def forward(self, x, edge_index):
        # First GCN layer
        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.dropout(x)
        
        # Second GCN layer
        x = self.conv2(x, edge_index)
        x = self.relu(x)
        x = self.dropout(x)
        
        # Third GCN layer
        x = self.conv3(x, edge_index)
        x = self.relu(x)
        x = self.dropout(x)
        
        # Fully connected layer for output
        x = self.linear(x)
        return x

In [14]:
model = GCN(input_dim = data.num_features, n_class = data.num_classes, hidden_dims=128, dropout=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train_epoch():
      model.train()

      for sub_data in data_loader:  # Iterate over each mini-batch.
            sub_data = sub_data.to(device)
            optimizer.zero_grad()  # Clear gradients.
            out = model(sub_data.x, sub_data.edge_index)  # Perform a single forward pass.
            loss = criterion(out[sub_data.train_mask], sub_data.y[sub_data.train_mask])  # Compute the loss solely based on the training nodes.
            loss.backward()  # Derive gradients.
            optimizer.step()  # Update parameters based on gradients.
      return loss

def test_epoch():
      model.eval()  # Set the model to evaluation mode.
      
      # Store predictions and ground truths for each mask.
      y_true_masks = {key: [] for key in ["train", "val"]}
      y_pred_masks = {key: [] for key in ["train", "val"]}
      
      with torch.no_grad():  # Disable gradient computation for evaluation.
            for sub_data in data_loader:  # Iterate over mini-batches.
                  sub_data = sub_data.to(device)
                  out = model(sub_data.x, sub_data.edge_index)  # Forward pass.
                  y_pred = out.argmax(dim=1)  # Use the class with the highest probability.
                  
                  # Collect predictions and ground truths for each mask.
                  for mask, key in zip(
                  [sub_data.train_mask, sub_data.val_mask], 
                  ["train", "val"]):
                        y_pred_masks[key].append(y_pred[mask].cpu())
                        y_true_masks[key].append(sub_data.y[mask].cpu())
      
      # Compute F1 scores for each mask.
      F1_scores = []
      for key in ["train", "val"]:
            y_true_combined = torch.cat(y_true_masks[key], dim=0).numpy()
            y_pred_combined = torch.cat(y_pred_masks[key], dim=0).numpy()
            F1_scores.append(
                  f1_score(y_true_combined, y_pred_combined, average="macro")
            )
      
      return F1_scores

In [16]:
for epoch in range(1, 501):
      train_loss = train_epoch()
      train_F1, val_F1 = test_epoch()
      if epoch % 100 == 0:
            print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Train F1: {train_F1:.4f}, Val F1: {val_F1:.4f}')

Epoch: 100, Train Loss: 0.4162, Train F1: 0.4995, Val F1: 0.4957
Epoch: 200, Train Loss: 0.3890, Train F1: 0.5539, Val F1: 0.5020
Epoch: 300, Train Loss: 0.3914, Train F1: 0.5301, Val F1: 0.4714
Epoch: 400, Train Loss: 0.4398, Train F1: 0.6032, Val F1: 0.5096
Epoch: 500, Train Loss: 0.3805, Train F1: 0.6010, Val F1: 0.5018


# Performance on Test Set

In [18]:
y_test_pred = []
y_test_pred_proba = []
y_test_true = []

model.eval()
with torch.no_grad():
    for sub_data in data_loader:
        sub_data = sub_data.to(device)
        out = model(sub_data.x, sub_data.edge_index)
        y_test_pred.append(out[sub_data.test_mask].argmax(dim=1).cpu())
        y_test_pred_proba.append(out[sub_data.test_mask].softmax(dim=1).cpu())
        y_test_true.append(sub_data.y[sub_data.test_mask].cpu())

y_test_pred = torch.cat(y_test_pred, dim=0).numpy()
y_test_pred_proba = torch.cat(y_test_pred_proba, dim=0).numpy()
y_test_true = torch.cat(y_test_true, dim=0).numpy()
performance_per_class(y_test_true, y_test_pred, y_test_pred_proba)

Unnamed: 0,precision,recall,f1-score,AUROC,AUPRC
0,0.897743,0.963988,0.929687,0.821051,0.950081
1,0.389359,0.230329,0.289438,0.745745,0.290939
2,0.299435,0.169329,0.216327,0.897109,0.231233
3,0.521472,0.408654,0.458221,0.975236,0.454065
