In [1]:
# General imports
import yaml
import numpy as np
import torch
import fsspec

# Current project imports
import goli
from goli.config._loader import load_datamodule, load_trainer
from goli.trainer.predictor import PredictorModule


Using backend: pytorch


In [2]:
# Remote folder holding the pretrained model, and the checkpoint / configuration
# files stored inside it
MODEL_PATH = "gs://goli-private/pretrained-models/htsfp-pcba-24M"
MODEL_FILE = f"{MODEL_PATH}/model.ckpt"
CONFIG_FILE = f"{MODEL_PATH}/configs.yaml"

# File containing the SMILES data to infer, and the name of its SMILES column.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
SMILES_DF_PATH = "gs://goli-public/datasets/goli-zinc-bench-gnn/smiles_score.csv.gz"
SMILES_COL = "SMILES"

# Number of layers to drop when inferring the fingerprints
# (passed to `drop_post_nn_layers` once the model is loaded)
NUM_LAYERS_TO_DROP = 3

In [3]:
# Fetch the YAML configuration saved with the trained model and parse it
with fsspec.open(CONFIG_FILE, "rb") as config_stream:
    cfg = yaml.safe_load(config_stream)

# Re-target the datamodule at the SMILES file to infer on: from the trained
# model's datamodule arguments only the featurization settings are kept, and
# no label columns are requested.
datamodule_cfg = cfg["datamodule"]
args = datamodule_cfg["args"]
datamodule_cfg["module_type"] = "DGLFromSmilesDataModule"
datamodule_cfg["args"] = {
    "df_path": SMILES_DF_PATH,
    "smiles_col": SMILES_COL,
    "label_cols": [],
    "featurization": args["featurization"],
}

# Build the datamodule from the modified configuration
datamodule = load_datamodule(cfg)

In [4]:
# Load the model, drop the layers, and load the trainer

# Restore the trained predictor from the checkpoint file.
predictor = PredictorModule.load_from_checkpoint(MODEL_FILE)
# Remove NUM_LAYERS_TO_DROP layers from the model's post-NN, in place.
# NOTE(review): presumably these are the final layers, so that the model emits
# fingerprints instead of task predictions -- confirm against
# `drop_post_nn_layers`.
predictor.model.drop_post_nn_layers(num_layers_to_drop=NUM_LAYERS_TO_DROP)
# Build the trainer from the same configuration dictionary used for the datamodule.
trainer = load_trainer(cfg)

# Display the predictor so the resulting architecture can be inspected.
predictor

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


DGL_GNN
---------
    pre-NN(depth=3, ResidualConnectionSimple(skip_steps=1))
        [FCLayer[69 -> 512 -> 512 -> 512]
    
    pre-NN-edges(depth=2, ResidualConnectionSimple(skip_steps=1))
        [FCLayer[16 -> 32 -> 32]
    
    GNN(depth=8, ResidualConnectionSimple(skip_steps=1))
        DGNMessagePassingLayer[512 -> 512 -> 512 -> 512 -> 512 -> 512 -> 512 -> 512 -> 1200]
        -> Pooling(['sum', 'max']) -> FCLayer(2400 -> 1200, activation=None)
    
    post-NN(depth=3, ResidualConnectionSimple(skip_steps=1))
        [FCLayer[1200 -> 1200 -> 1200 -> 689]

  | Name               | Type           | Params
------------------------------------------------------
0 | model              | FullDGLNetwork | 20.0 M
1 | model.pre_nn       | FeedForwardNN  | 564 K 
2 | model.pre_nn_edges | FeedForwardNN  | 1.7 K 
3 | model.gnn          | FeedForwardDGL | 19.5 M
4 | model.post_nn      | FeedForwardNN  | 0     
5 | loss_fun           | BCELoss        | 0     
---------------------------------

In [5]:
# Run inference over the whole datamodule; the results come back in batches
preds = trainer.predict(model=predictor, datamodule=datamodule)

# Stack the batched results along the first axis. The first batch decides the
# handling: torch tensors are detached, moved to CPU and converted to numpy
# arrays beforehand; anything else is concatenated as-is.
preds = np.concatenate(
    [batch.detach().cpu().numpy() for batch in preds]
    if isinstance(preds[0], torch.Tensor)
    else preds,
    axis=0,
)

# Display the resulting array
preds

2021-07-09 13:35:05.180 | INFO     | goli.data.datamodule:prepare_data:355 - Prepare dataset with 12000 data points.
Missing logger folder: logs/htsfp-pcba\default
Predicting: 100%|██████████| 750/750 [04:49<00:00,  2.59it/s]


array([[-0.5803324 ,  0.9320879 ,  0.36963642, ...,  0.0749743 ,
        -0.51820433,  0.9317135 ],
       [-1.373152  , -2.6965613 ,  1.6978087 , ...,  0.5530317 ,
        -0.44438946,  0.3196116 ],
       [-1.4481115 , -0.613314  ,  0.33443648, ..., -0.7772472 ,
         0.6488527 ,  0.94782364],
       ...,
       [ 0.48269045, -2.212933  , -0.76536405, ...,  0.25664926,
        -0.23683667, -1.0514059 ],
       [-0.26671052, -2.3024874 , -1.1361103 , ...,  0.42454743,
         0.57352805,  0.4816807 ],
       [-0.6714377 , -0.7725297 , -1.1829937 , ...,  0.8448169 ,
        -0.6167966 ,  0.9792832 ]], dtype=float32)

In [6]:
# Shape of the array obtained by concatenating the batched predictions along axis 0
preds.shape

(12000, 1200)