In [1]:
from ogb.lsc import DglPCQM4MDataset, PCQM4MEvaluator

import argparse
import dgl
import numpy as np
import os
import random
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

import pandas as pd

from gnn import GNN

import os
import os.path as osp
import shutil
from ogb.utils import smiles2graph
from ogb.utils.torch_util import replace_numpy_with_torchtensor
from ogb.utils.url import decide_download, download_url, extract_zip
import pandas as pd
import numpy as np
from dgl.data.utils import load_graphs, save_graphs, Subset
import dgl
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

# L1 (mean absolute error) loss module; defined at import time and not
# referenced by any of the cells visible in this notebook.
reg_criterion = torch.nn.L1Loss()

Using backend: pytorch


In [2]:
class Argument(object):
    """Bare attribute holder used in place of the main_gnn CLI arguments."""
    name = "args"


# Settings that would otherwise come from the main_gnn command line,
# attached one by one as instance attributes.
args = Argument()
for _option, _setting in (
    ("batch_size", 256),
    ("num_workers", 0),
    ("num_layers", 5),
    ("emb_dim", 600),
    ("drop_ratio", 0.1),  # 0
    ("graph_pooling", "sum"),
    ("device", 0),
    ("train_subset", False),
    ("epochs", 1),
):
    setattr(args, _option, _setting)

# GPU selection is disabled; everything below runs on the CPU.
# device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
device = "cpu"

# Keyword arguments forwarded to the model constructors.
shared_params = {
    option: getattr(args, option)
    for option in ("num_layers", "emb_dim", "drop_ratio", "graph_pooling")
}


## load dataset

In [3]:
from ogb.lsc import DglPCQM4MDataset, PCQM4MEvaluator

In [4]:
def createGraph(smiles):
    """Convert a SMILES string into a DGL graph.

    The string is parsed with ``smiles2graph``; the resulting edge index
    and node count define the graph, and the edge / node feature arrays
    are stored as int64 tensors under the ``'feat'`` key of ``edata`` and
    ``ndata`` respectively.

    Args:
        smiles: SMILES string passed straight to ``smiles2graph``.

    Returns:
        The constructed ``dgl`` graph with ``'feat'`` edge and node data.
    """
    mol = smiles2graph(smiles)
    sources = mol['edge_index'][0]
    targets = mol['edge_index'][1]

    g = dgl.graph((sources, targets), num_nodes=mol['num_nodes'])

    edge_features = torch.from_numpy(mol['edge_feat']).to(torch.int64)
    node_features = torch.from_numpy(mol['node_feat']).to(torch.int64)
    g.edata['feat'] = edge_features
    g.ndata['feat'] = node_features

    return g

In [5]:
ROOT = "dataset/pcqm4m_kddcup2021"
filename = "{}/{}".format(ROOT, "raw/data.csv.gz")
data_df = pd.read_csv(filename)

# Load the train/valid/test split dictionary stored next to the raw data.
split_dict = torch.load(osp.join(ROOT, 'split_dict.pt'))
print(split_dict.keys())

# Build the validation and test frames. Each is an explicit copy so that
# adding the "batch" column does not write into a slice of data_df (which
# triggers pandas' SettingWithCopyWarning).
# "batch" buckets rows by idx // batch_size; get_prediction groups on it.
# NOTE(review): assumes "idx" is a non-negative integer column -- confirm.
valid_df = data_df.loc[split_dict["valid"]].copy()
valid_df["batch"] = valid_df["idx"] // args.batch_size
test_df = data_df.loc[split_dict["test"]].copy()
test_df["batch"] = test_df["idx"] // args.batch_size


dict_keys(['train', 'valid', 'test'])


## models

In [6]:
def get_prediction(model, df):
    """Run ``model`` over every batch of ``df`` and collect its predictions.

    Rows are grouped by the ``"batch"`` column; for each group the SMILES
    strings are converted to graphs with ``createGraph``, batched with
    ``dgl.batch``, moved to the notebook-level ``device`` and fed to the model.

    Args:
        model: Callable invoked as ``model(batched_graph, node_feat, edge_feat)``;
            it is switched to eval mode before inference.
        df: DataFrame providing the ``"batch"``, ``"smiles"`` and
            ``"homolumogap"`` columns.

    Returns:
        Tuple ``(preds, labels)`` of numpy arrays, concatenated in the order
        the groups are iterated (ascending ``"batch"`` key, original row
        order within each group).
    """
    model.eval()
    labels = []
    preds = []
    # Inference only: disable autograd so no computation graph is kept alive
    # for each batch (lower memory use, no effect on the predicted values).
    with torch.no_grad():
        for _, subDF in tqdm(df.groupby("batch")):
            graphs = [createGraph(smiles) for smiles in subDF["smiles"].values]
            labels.extend(subDF["homolumogap"].values)

            bg = dgl.batch(graphs)
            bg = bg.to(device)

            # Features are popped off the batched graph and passed explicitly.
            x = bg.ndata.pop('feat')
            edge_attr = bg.edata.pop('feat')

            # Flatten the model output to one value per graph.
            pred = model(bg, x, edge_attr).view(-1, )
            preds.extend(pred.detach().cpu().numpy().tolist())

    preds = np.array(preds)
    labels = np.array(labels)

    return preds, labels

In [7]:
from gnn import DiffPoolGNN

# Output locations for the GIN + virtual node + DiffPool model.
args.log_dir = "models/gin-virtual-diffpool/log"
args.checkpoint_dir = "models/gin-virtual-diffpool/checkpoint"
args.save_test_dir = "models/gin-virtual-diffpool/test"

model = DiffPoolGNN(gnn_type='gin', virtual_node=True, **shared_params).to(device)
# Replace the post-pooling batch norm with one over 5 features.
# NOTE(review): presumably required so the module matches the saved
# checkpoint's shapes -- confirm against the training script.
model.gc_after_pool.bn = torch.nn.BatchNorm1d(5).to(device)

# Restore the trained weights; a checkpoint is required because the cells
# below evaluate this model.
checkpointFile = os.path.join(args.checkpoint_dir, 'checkpoint.pt')
print(checkpointFile)
if not os.path.exists(checkpointFile):
    # Fail with a clear message instead of a NameError on checkpointData
    # (or a stale value left over from an earlier kernel session).
    raise FileNotFoundError("Checkpoint not found: {}".format(checkpointFile))

print("Loading existing weights from {}".format(checkpointFile))
# map_location keeps loading working when the checkpoint was saved from a
# different device than the one used here.
checkpointData = torch.load(checkpointFile, map_location=device)
model.load_state_dict(checkpointData["model_state_dict"], strict=True)
model.eval()

print(args.log_dir, checkpointData["best_val_mae"])




models/gin-virtual-diffpool/checkpoint\checkpoint.pt
Loading existing weights from models/gin-virtual-diffpool/checkpoint\checkpoint.pt
models/gin-virtual-diffpool/log 0.1370588093996048


In [None]:
"""
evaluate
"""
# Predict on the validation split.
preds, labels = get_prediction(model, valid_df)

# Score the predictions with PCQM4MEvaluator and report the MAE.
evaluator = PCQM4MEvaluator()
y_true, y_pred = torch.Tensor(labels), torch.Tensor(preds)
input_dict = dict(y_true=y_true, y_pred=y_pred)
valid_mae = evaluator.eval(input_dict)["mae"]
print(args.log_dir, valid_mae)

# Persist the validation results (molecule index, label, prediction) as CSV.
valid_rows = zip(list(valid_df["idx"].values), labels, preds)
diffpoolDF = pd.DataFrame(
    valid_rows,
    columns=["molecule_idx", "label", "gin-diffpool-pred"],
)
diffpoolDF.to_csv("models/gin-virtual-diffpool/gin-virtual-diffpool-validResult.csv")


In [8]:
"""
testing
"""
# Predict on the test split; the returned labels are discarded.
diffpool_preds, _ = get_prediction(model, test_df)

# Store the raw prediction array in .npy format.
test_result_path = 'models/gin-virtual-diffpool/gin-virtual-diffpool-testResult.npy'
with open(test_result_path, 'wb') as f:
    np.save(f, diffpool_preds)


100%|██████████████████████████████████████████████████████████████████████████████| 1476/1476 [28:13<00:00,  1.15s/it]


In [9]:
# Write diffpool_preds to the test-result .npy file (overwrites any existing file).
np.save('models/gin-virtual-diffpool/gin-virtual-diffpool-testResult.npy', diffpool_preds)