# Benchmark for GCN

## Read data

In [4]:
from torch_geometric.datasets import PPI
import torch_geometric.transforms as T
from torch_geometric.utils import add_self_loops
from torch_geometric.loader import DataLoader
from GNNTraining import GNNTraining
import torch

# Split identifiers; they double as the PPI `split` argument and as keys of `sets`.
TRAIN, VAL, TEST = "train", "val", "test"
set_names = [TRAIN, TEST, VAL]

# Load the three PPI splits from the same root directory.
train_dataset, val_dataset, test_dataset = (
    PPI(root='/tmp/PPI', split=split_name) for split_name in (TRAIN, VAL, TEST)
)

# One loader per split whose batch size equals the split size, so a single
# `next()` yields the whole split collated into one batch.
train_loader, val_loader, test_loader = (
    iter(DataLoader(split, batch_size=len(split)))
    for split in (train_dataset, val_dataset, test_dataset)
)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Full-split batches (pulled in the order train, test, val).
train_set = next(train_loader)
test_set = next(test_loader)
val_set = next(val_loader)

# NOTE(review): `sets` maps each split name to the dataset object, not to the
# collated `*_set` batches above — confirm this is what GNNTraining/GNNEvaluate expect.
sets = {TRAIN: train_dataset, TEST: test_dataset, VAL: val_dataset}

## Define GNN architecture

In [5]:
import torch
from torch import nn
from torch_geometric.nn import GCNConv
from torch.nn import Linear
import torch.nn.functional as F
class GNN(nn.Module):
    """Three-layer GCN where every layer adds a parallel linear projection.

    Each layer computes ``GCNConv(x, edge_index) + Linear(x)``. Dropout is
    applied to the input of every layer and ELU follows the first two layers;
    the last layer's output is returned without an activation.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Size of the two hidden representations.
        out_dim: Size of the output produced per node.
        dropout: Dropout probability used before every layer.
        normalize: Forwarded to every ``GCNConv`` as ``normalize``.
        add_self_loops: Forwarded to every ``GCNConv`` as ``add_self_loops``.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2, normalize=False, add_self_loops=True):
        super().__init__()
        conv_options = {"normalize": normalize, "add_self_loops": add_self_loops}

        # Layers are created in conv/linear pairs; the creation order is kept
        # so parameter registration (and seeded initialisation) is unchanged.
        self.conv1 = GCNConv(in_dim, hidden_dim, **conv_options)
        self.lin1 = Linear(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim, **conv_options)
        self.lin2 = Linear(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, out_dim, **conv_options)
        self.lin3 = Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, edge_index):
        """Return the per-node outputs of the last layer (no final activation).

        Args:
            x: Node features — presumably shaped (num_nodes, in_dim); confirm with caller.
            edge_index: Graph connectivity passed unchanged to every ``GCNConv``.
        """
        hidden = self.dropout(x)
        hidden = F.elu(self.conv1(hidden, edge_index) + self.lin1(hidden))

        hidden = self.dropout(hidden)
        hidden = F.elu(self.conv2(hidden, edge_index) + self.lin2(hidden))

        hidden = self.dropout(hidden)
        return self.conv3(hidden, edge_index) + self.lin3(hidden)

## Hyperparameter tuning for GNN

In [6]:
from sklearn.model_selection import ParameterGrid
from tqdm.notebook import tqdm

# Search space: every combination of these values is trained once.
space = {
    "WEIGHT_DECAYS": [0],
    "DROPOUT": [0.0, 0.2, 0.4],
    "HIDDEN_DIMS": [128, 256, 512],
    "LEARNING_RATES": [1e-4, 5e-3, 1e-3, 5e-4],
    "SELF_LOOPS": [True, False],
    "NORMALIZE": [True, False],
}

param_grid = ParameterGrid(space)
best_params_overall = None
best_val_overall = float("inf")

# Iterate over the grid object itself (not `param_grid.__iter__()`): ParameterGrid
# has a length, so tqdm can show the total number of configurations and an ETA
# instead of an open-ended "0it" counter.
for params in tqdm(param_grid):
    gnnTraining = GNNTraining(
        device=device,
        GNN=GNN,
        sets=sets,
        hidden_dim=params["HIDDEN_DIMS"],
        lr=params["LEARNING_RATES"],
        dropout=params["DROPOUT"],
        weight_decay=params["WEIGHT_DECAYS"],
        epochs=1000,
        kwargs={"normalize": params["NORMALIZE"], "add_self_loops": params["SELF_LOOPS"]},
    )
    gnnTraining.train()

    # Keep the configuration with the lowest validation loss; on a tie the
    # later configuration wins because the comparison is `<=`.
    if gnnTraining.best_val_loss <= best_val_overall:
        print("Updated params")
        best_val_overall = gnnTraining.best_val_loss
        best_params_overall = params

0it [00:00, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Training & Evaluation

In [4]:
from GNNTraining import GNNTraining
from GNNEvaluate import GNNEvaluate 

# Train a single model with one fixed, hand-picked configuration.
gnnTraining = GNNTraining(
    device=device,
    GNN=GNN,
    sets=sets,
    hidden_dim=512,
    lr=5e-3,
    dropout=0.2,
    weight_decay=0.0,
    epochs=1000,
    kwargs={"normalize": True, "add_self_loops": True},
)
best_model = gnnTraining.train()

# Evaluate the model returned by training; the value of the last expression
# is shown as the cell output.
gnnEvaluate = GNNEvaluate(device=device, sets=sets)
gnnEvaluate.evaluate(best_model)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.9600572791884782

## Standard deviation over 10 runs

In [6]:
from GNNTraining import GNNTraining
from GNNEvaluate import GNNEvaluate 
from tqdm.notebook import tqdm

# Per-run results: training time and evaluation score of each repetition.
times = []
scores = []
for i in tqdm(range(10)):
    # Same fixed configuration for every repetition; only the run differs.
    gnnTraining = GNNTraining(
        device=device,
        GNN=GNN,
        sets=sets,
        hidden_dim=512,
        lr=5e-3,
        dropout=0.2,
        weight_decay=0.0,
        epochs=1000,
        kwargs={"normalize": True, "add_self_loops": True},
    )
    best_model = gnnTraining.train()
    times.append(gnnTraining.training_time)

    # Score the model produced by this run and record it.
    gnnEvaluate = GNNEvaluate(device=device, sets=sets)
    score = gnnEvaluate.evaluate(best_model)
    scores.append(score)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
import numpy as np
# Summarise the repeated runs: mean and standard deviation (np.std with its
# default settings) of `scores`, then the mean of `times` after the semicolon.
# NOTE(review): the unit of `times` comes from GNNTraining.training_time —
# presumably seconds; confirm.
print(f"F1-score: {np.mean(scores)} +- {np.std(scores)}; {np.mean(times)}")

F1-score: 0.960577126981376 +- 0.000838829249817186; 343.01980459690094


In [13]:
class GCN(nn.Module):
    """Three-layer GCN whose layers each add a linear projection of their input.

    Every layer evaluates ``conv(x, edge_index) + lin(x)``. Dropout precedes
    each layer, ELU follows the first two, and the third layer's output is
    returned as-is.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Hidden size; cast with ``int()`` before use.
        out_dim: Size of the output produced per node.
        dropout: Dropout probability used before every layer.
        normalize: Forwarded to every ``GCNConv`` as ``normalize``.
        add_self_loops: Forwarded to every ``GCNConv`` as ``add_self_loops``.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2, normalize=False, add_self_loops=True):
        super().__init__()
        width = int(hidden_dim)
        gcn_kwargs = dict(normalize=normalize, add_self_loops=add_self_loops)

        # Graph convolutions first, then the linear projections: this
        # registration order is kept exactly as before.
        self.conv1 = GCNConv(in_dim, width, **gcn_kwargs)
        self.conv2 = GCNConv(width, width, **gcn_kwargs)
        self.conv3 = GCNConv(width, out_dim, **gcn_kwargs)
        self.lin1 = Linear(in_dim, width)
        self.lin2 = Linear(width, width)
        self.lin3 = Linear(width, out_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, edge_index):
        """Run the three layers and return the last layer's raw output.

        Args:
            x: Node features — presumably shaped (num_nodes, in_dim); confirm with caller.
            edge_index: Graph connectivity handed to every ``GCNConv``.
        """
        features = self.dropout(x)
        features = self.conv1(features, edge_index) + self.lin1(features)
        features = self.dropout(F.elu(features))
        features = self.conv2(features, edge_index) + self.lin2(features)
        features = self.dropout(F.elu(features))
        return self.conv3(features, edge_index) + self.lin3(features)

In [25]:
import numpy as np
from sklearn.metrics import f1_score
def train_val_data(train_data, manual_seed = None, train_size = 0.8):
    """Randomly split ``train_data`` into a training part and a validation part.

    Args:
        train_data: Container supporting ``len()`` and indexing with a list of
            integer positions.
        manual_seed: Seed passed to ``torch.manual_seed`` before shuffling.
            ``None`` leaves the global RNG untouched; any integer, including
            ``0``, seeds it.
        train_size: Fraction of the samples placed in the training part; the
            count is ``int(train_size * len(train_data))``.

    Returns:
        Tuple ``(train_part, val_part)`` obtained by indexing ``train_data``
        with the first and the remaining positions of a random permutation.
    """
    # Compare against None so that a seed of 0 is honoured instead of ignored.
    if manual_seed is not None:
        torch.manual_seed(manual_seed)

    n_samples = len(train_data)
    n_train = int(train_size * n_samples)

    # The permutation already holds the shuffled positions 0..n_samples-1.
    shuffled = torch.randperm(n_samples)
    train_positions = shuffled[:n_train].tolist()
    val_positions = shuffled[n_train:].tolist()

    return train_data[train_positions], train_data[val_positions]
    
def evaluate_fun(fitted_model, data):
    """Return the micro-averaged F1 score of ``fitted_model`` over ``data``.

    The model is switched to eval mode and run graph by graph under
    ``torch.inference_mode``. An output greater than 0 counts as a positive
    prediction for that label.

    Args:
        fitted_model: Callable model taking ``(x, edge_index)``.
        data: Iterable of graphs exposing ``x``, ``edge_index`` and ``y``.

    Returns:
        ``f1_score(..., average="micro")`` computed on all graphs together.
    """
    predicted_chunks, target_chunks = [], []
    with torch.inference_mode():
        fitted_model.eval()
        for graph in data:
            graph = graph.to(device)
            logits = fitted_model(graph.x, graph.edge_index)
            # Threshold the raw outputs at 0 to obtain binary predictions.
            predicted_chunks.append((logits > 0).float())
            target_chunks.append(graph.y)
            graph = graph.cpu()

    # Stack the per-graph results and move them to the CPU for scikit-learn.
    predictions = torch.cat(predicted_chunks).cpu().detach() > 0
    targets = torch.cat(target_chunks).cpu()
    return f1_score(targets, predictions, average="micro")

def train_fun(data, hyperparameters, max_epochs=1000, patience=100):
    """Train a ``GCN`` on ``data`` with early stopping on a held-out split.

    ``data`` is split 80/20 (seed 42) into training and validation graphs. The
    model is optimised with Adam on ``BCEWithLogitsLoss``, one optimiser step
    per training graph. After every epoch the validation F1 score is computed;
    once more than ``patience`` epochs have run, training stops as soon as the
    current score falls below the mean of the last ``patience + 1`` scores.

    Args:
        data: Dataset of graphs; ``data.x.shape[-1]`` gives the input feature size.
        hyperparameters: Mapping with ``"lr"`` and ``"weight_decay"`` for the
            optimiser; every other entry is forwarded to the ``GCN``
            constructor. The mapping is not modified.
        max_epochs: Upper bound on the number of training epochs.
        patience: Number of epochs before early stopping may trigger, and the
            size (minus one) of the score window used for the comparison.

    Returns:
        The model as it is after the last executed epoch.
    """
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")
    lr = hyperparameters["lr"]
    weight_decay = hyperparameters["weight_decay"]

    # Everything except the optimiser settings configures the model itself.
    model_hyperparams = {
        key: value
        for key, value in hyperparameters.items()
        if key not in ("weight_decay", "lr")
    }
    model = GCN(in_dim=data.x.shape[-1], **model_hyperparams).to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    train_data, val_data = train_val_data(data, 42, 0.8)

    scores = []
    for epoch in range(max_epochs):
        # evaluate_fun() leaves the model in eval mode, so switch back to
        # training mode once per epoch rather than once per graph.
        model.train()
        acc_loss = 0
        for graph in train_data:
            graph = graph.to(device)
            out = model(graph.x, graph.edge_index)
            loss = loss_fn(out, graph.y)
            acc_loss += loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()
            graph = graph.cpu()
        # Summed training loss over all graphs of this epoch.
        print(acc_loss)

        score = evaluate_fun(model, val_data)
        scores.append(score)

        # Early stopping: the window includes the current score, so this
        # triggers when the newest score is below the recent average.
        if epoch > patience and score < np.mean(scores[-(patience + 1):]):
            break
    return model

In [26]:
import time
# Train on the PPI training split; `lr` and `weight_decay` configure the
# optimiser, the remaining entries are passed to the GCN constructor.
model = train_fun(
    train_dataset,
    dict(
        add_self_loops=True,
        dropout=0.0,
        hidden_dim=1028,
        lr=3e-4,
        normalize=True,
        out_dim=121,
        weight_decay=0,
    ),
)

9.587925016880035
8.736396133899689
8.31795871257782
8.104421705007553
7.995105862617493
7.908246874809265
7.830211102962494
7.75655409693718
7.685425758361816
7.614999949932098
7.543794393539429
7.4707159996032715
7.3952131271362305
7.31724551320076
7.2372488379478455
7.155951887369156
7.07419815659523
6.992789179086685
6.912252247333527
6.832769423723221
6.754324048757553
6.676917284727097
6.600549787282944
6.525219887495041
6.450927019119263
6.377662509679794
6.305420845746994
6.234208881855011
6.164037615060806
6.094921201467514
6.026863664388657
5.959864109754562
5.893921583890915
5.829029619693756
5.765182316303253
5.702373594045639
5.640603303909302
5.579877495765686
5.520188510417938
5.461531788110733
5.403891950845718
5.3472491800785065
5.291594505310059
5.236912101507187
5.183179318904877
5.1303883492946625
5.0785326063632965
5.027594298124313
4.977543145418167
4.928355187177658
4.880017548799515
4.832514196634293
4.785819888114929
4.739917427301407
4.6947861313819885
4.65040

In [27]:
# Micro-averaged F1 score of the trained model on the PPI test split.
evaluate_fun(model, test_dataset)

0.9420233472801247

In [28]:
# Size of the training split (number of dataset entries).
len(train_dataset)

20

In [29]:
# Size of the test split (number of dataset entries).
len(test_dataset)

2

In [30]:
# Size of the validation split (number of dataset entries).
len(val_dataset)

2