1. Run every model and extract its predictions on the validation and test sets.
2. Average the models' predictions on the validation and test sets (ensembling).

In [1]:
import pandas as pd
from ogb.lsc import PygPCQM4MDataset, PCQM4MEvaluator
import numpy as np

from torch_geometric.data import DataLoader
from tqdm import tqdm
import torch

import os

Using backend: pytorch


In [2]:
# --- Hyperparameters ---
# Mirrors the arguments that the main_gnn CLI would normally supply.
class Argument(object):
    """Plain attribute container standing in for the parsed CLI arguments."""
    name = "args"


args = Argument()
# Populate the instance attributes in one go (same values as the CLI defaults used here).
vars(args).update({
    "batch_size": 256,
    "num_workers": 0,
    "num_layers": 5,
    "emb_dim": 600,
    "drop_ratio": 0,
    "graph_pooling": "sum",
    "device": 0,
})

# Keyword arguments passed to every model constructor in this notebook.
shared_params = {
    key: getattr(args, key)
    for key in ('num_layers', 'emb_dim', 'drop_ratio', 'graph_pooling')
}

# Use the GPU selected by args.device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:" + str(args.device))
else:
    device = torch.device("cpu")

## load data

In [3]:
# --- Load data ---
# OGB-LSC PCQM4M dataset; PygPCQM4MDataset and DataLoader come from the imports cell
# at the top of the notebook.
dataset = PygPCQM4MDataset(root='dataset/')
split_idx = dataset.get_idx_split()

# Evaluation-only loaders: shuffling stays off so predictions come back in the same
# order as the indices in split_idx.
valid_loader = DataLoader(
    dataset[split_idx["valid"]],
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
)
test_loader = DataLoader(
    dataset[split_idx["test"]],
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
)

# Split sizes, printed in the order: train, test, valid.
print(*(len(split_idx[part]) for part in ("train", "test", "valid")))

3045360 377423 380670


In [4]:
def get_prediction(model, loader):
    """Run `model` over every batch of `loader` and collect labels and predictions.

    Args:
        model: callable applied to each batch; its output is flattened with `view(-1)`.
        loader: iterable of batches that support `.to(device)` and expose labels as `.y`.

    Returns:
        (y_true, y_pred): two flat Python lists, in loader order, holding the labels
        and the flattened predictions of all batches.

    Notes:
        Uses the notebook-level `device`. The model's train/eval mode is not touched
        here; the calling cells switch it with `model.eval()` before calling.
    """
    y_true = []
    y_pred = []

    # Inference only: disable autograd so no computation graph is recorded per batch
    # (saves memory and time; the predicted values are unchanged).
    with torch.no_grad():
        for batch in tqdm(loader, desc="Iteration"):
            # move the batch to the device the model lives on
            batch = batch.to(device)

            pred = model(batch).view(-1)

            # collate predictions and labels as numpy scalars
            y_pred.extend(pred.detach().cpu().numpy())
            y_true.extend(batch.y.detach().cpu().numpy())

    return y_true, y_pred

## pygeom models

In [5]:
# --- gin-virtual-bayes-lastLayer ---
# BayesianGNN (GIN + virtual node) constructed with last_layer_only=True.
from gnn import BayesianGNN

args.checkpoint_dir = "models/gin-virtual-bnn-lastLayer/checkpoint"

# Load the checkpoint directly onto the active device. Without map_location a
# checkpoint saved from a GPU run cannot be restored when `device` fell back to CPU.
checkpoint = torch.load(os.path.join(args.checkpoint_dir, 'checkpoint.pt'), map_location=device)

gnn_name = "gin-virtual-bnn-lastLayer"
gnn_type = "gin"
virtual_node = True

model = BayesianGNN(gnn_type = gnn_type, virtual_node = virtual_node, last_layer_only=True, **shared_params).to(device)
# strict=True: every key in the checkpoint must match the model exactly
model.load_state_dict(checkpoint["model_state_dict"], strict=True)
# set model to evaluation-only
model.eval()
print(gnn_name, checkpoint["best_val_mae"])

# --- predictions for the validation split ---
y_true, y_pred = get_prediction(model, valid_loader)
evaluator = PCQM4MEvaluator()
input_dict = {"y_true": torch.tensor(y_true), "y_pred": torch.tensor(y_pred)}
# NOTE(review): in the recorded run this MAE differs slightly from the checkpoint's
# best_val_mae; presumably BayesianGNN's forward pass is stochastic -- confirm.
print("BayesianGNN", evaluator.eval(input_dict)["mae"])

# save results to dataframe
# NOTE: the prediction column is called "gin-pred" here too; the ensembling cell
# further down reads it under that name.
ginBNNDF = pd.DataFrame(zip(split_idx["valid"].cpu().tolist(), y_true, y_pred), columns=["molecule_idx", "label", "gin-pred"])

# --- predictions for the test split (the labels returned for it are discarded) ---
_, ginBNN_test_pred = get_prediction(model, test_loader)
ginBNN_test_pred = np.array(ginBNN_test_pred)
ginBNN_test_pred.shape

Iteration:   0%|                                                                              | 0/1487 [00:00<?, ?it/s]

gin-virtual-bnn-lastLayer 0.14056678116321564


Iteration: 100%|███████████████████████████████████████████████████████████████████| 1487/1487 [02:59<00:00,  8.27it/s]


BayesianGNN 0.14057622849941254


Iteration: 100%|███████████████████████████████████████████████████████████████████| 1475/1475 [03:11<00:00,  7.72it/s]


(377423,)

In [6]:
# --- gin-virtual ---
# Plain GNN (GIN + virtual node).
from gnn import GNN

args.checkpoint_dir = "models/gin-virtual/checkpoint"

# Load the checkpoint directly onto the active device. Without map_location a
# checkpoint saved from a GPU run cannot be restored when `device` fell back to CPU.
checkpoint = torch.load(os.path.join(args.checkpoint_dir, 'checkpoint.pt'), map_location=device)

gnn_name = "gin-virtual"
gnn_type = "gin"
virtual_node = True

model = GNN(gnn_type = gnn_type, virtual_node = virtual_node, **shared_params).to(device)
# strict=True: every key in the checkpoint must match the model exactly
model.load_state_dict(checkpoint["model_state_dict"], strict=True)
# set model to evaluation-only
model.eval()
print(gnn_name, checkpoint["best_val_mae"])

# --- predictions for the validation split ---
y_true, y_pred = get_prediction(model, valid_loader)
evaluator = PCQM4MEvaluator()
input_dict = {"y_true": torch.tensor(y_true), "y_pred": torch.tensor(y_pred)}
# Label fixed: this cell evaluates the plain GNN, not the BayesianGNN.
print("GNN", evaluator.eval(input_dict)["mae"])

# save results to dataframe
ginDF = pd.DataFrame(zip(split_idx["valid"].cpu().tolist(), y_true, y_pred), columns=["molecule_idx", "label", "gin-pred"])

# --- predictions for the test split (the labels returned for it are discarded) ---
_, gin_test_pred = get_prediction(model, test_loader)
gin_test_pred = np.array(gin_test_pred)
gin_test_pred.shape

Iteration:   0%|                                                                              | 0/1487 [00:00<?, ?it/s]

gin-virtual 0.14212889969348907


Iteration: 100%|███████████████████████████████████████████████████████████████████| 1487/1487 [02:06<00:00, 11.71it/s]


BayesianGNN 0.14212889969348907


Iteration: 100%|███████████████████████████████████████████████████████████████████| 1475/1475 [02:12<00:00, 11.10it/s]


(377423,)

## load dgl models

In [16]:
# Validation predictions exported by the DGL pipeline (one row per validation molecule).
ginDiffPoolDF = pd.read_csv("../pcqm4m-dgl/models/gin-virtual-diffpool/gin-virtual-diffPool-validResult.csv")

# Test predictions exported by the DGL pipeline as a .npy array.
ginDiffPool_test_pred = np.load('../pcqm4m-dgl/models/gin-virtual-diffpool/gin-virtual-diffpool-testResult.npy')

## ensembling

In [17]:
# Preview the first rows of the gin-virtual validation predictions.
ginDF.head()

Unnamed: 0,molecule_idx,label,gin-pred
0,3045360,4.870838,4.860394
1,3045361,5.322547,5.343346
2,3045362,5.240913,4.994455
3,3045363,5.227307,5.107187
4,3045364,4.868117,5.044456


In [18]:
# Preview the first rows of the gin-virtual-bnn-lastLayer validation predictions.
# NOTE: the prediction column is named "gin-pred" in this frame as well.
ginBNNDF.head()

Unnamed: 0,molecule_idx,label,gin-pred
0,3045360,4.870838,4.828844
1,3045361,5.322547,5.327822
2,3045362,5.240913,5.105467
3,3045363,5.227307,5.183986
4,3045364,4.868117,5.031127


In [19]:
# Preview the first rows of the gin-virtual-diffpool validation predictions loaded from CSV.
ginDiffPoolDF.head()

Unnamed: 0.1,Unnamed: 0,molecule_idx,label,gin-diffpool-pred
0,0,3045360,4.870838,4.857602
1,1,3045361,5.322547,5.303278
2,2,3045362,5.240913,5.092427
3,3,3045363,5.227307,5.085762
4,4,3045364,4.868117,5.007533


In [20]:
# --- Validation ensembling ---
# Score every model on its own and every averaged combination of models.
valid_preds = {
    "gin": ginDF["gin-pred"].to_numpy(),
    "gin-bnn": ginBNNDF["gin-pred"].to_numpy(),
    "gin-diffpool": ginDiffPoolDF["gin-diffpool-pred"].to_numpy(),
}

# Take the labels from the stored frame rather than from the leftover global `y_true`,
# so the result does not depend on which model cell happened to run last.
valid_labels = torch.tensor(ginDF["label"].to_numpy())

# Averaging row by row is only meaningful if all frames list the same molecules
# in the same order.
for frame_name, frame in (("ginBNNDF", ginBNNDF), ("ginDiffPoolDF", ginDiffPoolDF)):
    assert np.array_equal(frame["molecule_idx"].to_numpy(), ginDF["molecule_idx"].to_numpy()), (
        frame_name + " is not row-aligned with ginDF"
    )

evaluator = PCQM4MEvaluator()
for members in [
    ("gin",),
    ("gin-bnn",),
    ("gin-diffpool",),
    ("gin", "gin-bnn"),
    ("gin", "gin-diffpool"),
    ("gin-bnn", "gin-diffpool"),
    ("gin", "gin-bnn", "gin-diffpool"),
]:
    name = " + ".join(members)
    # Stack to shape (n_models, n_molecules) and average across models (vectorized).
    mean_pred = np.mean(np.stack([valid_preds[member] for member in members]), axis=0)
    input_dict = {"y_true": valid_labels, "y_pred": torch.tensor(mean_pred)}
    print(name, evaluator.eval(input_dict)["mae"])


gin 0.1421289058183607
gin-bnn 0.14057623964997648
gin-diffpool 0.1370588209054448
gin + gin-bnn 0.13506544059592193
gin + gin-diffpool 0.1338790091397235
gin-bnn + gin-diffpool 0.13302038444807135
gin + gin-bnn + gin-diffpool 0.13183168120432173


In [21]:
# --- Test ensembling and submission file ---
args.save_test_dir = "models/ensembling-gin-virtual-diffpool-bayesLastLayer"

pairs = (ginBNN_test_pred, gin_test_pred, ginDiffPool_test_pred)
# Stack to shape (n_models, n_molecules) and average across models.
# np.stack raises if the arrays differ in shape, whereas zip() would silently
# truncate to the shortest one.
ensemble_pred = np.mean(np.stack(pairs), axis=0)

# Create the evaluator here instead of relying on one left over from an earlier cell.
evaluator = PCQM4MEvaluator()
evaluator.save_test_submission({'y_pred': ensemble_pred}, args.save_test_dir)


In [22]:
# Sanity check: the number of ensemble predictions should equal the size of the
# test split printed when the data was loaded.
len(ensemble_pred)

377423