# GROVER Feature Generation

This notebook outlines how the GROVER features were generated for the host and guest pairs. Due to package compatibility issues, this notebook cannot run with the provided virtual environment. To use this notebook, create a separate virtual environment that contains the "grover" library and associated dependencies. For more information on the "grover" library, see the below resource.

[GROVER Repository](https://github.com/tencent-ailab/grover)


For external dataset GROVER feature generation, file paths can be replaced with the appropriate .csv files.

In [None]:
import torch
from grover.task.fingerprint import generate_fingerprints
import numpy as np
from grover.util.utils import create_logger

# Step 1: Load the checkpoint (assumes you have both 'state_dict' and 'args' saved)
checkpoint_path = "grover_large.pt"
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)

# Print the keys and args in the checkpoint
print(checkpoint.keys())
print(checkpoint["args"])
print(checkpoint["state_dict"].keys())

dict_keys(['state_dict', 'args'])
Namespace(embedding_output_type='both', backbone='dualtrans', hidden_size=1200, bias=False, depth=6, activation='PReLU', undirected=False, weight_decay=2e-07, select_by_loss=True, skip_epoch=0, no_attach_fea=True, dist_coff=0.1, bond_drop_rate=0, input_layer='fc', num_attn_head=4, num_mt_block=1, dense=False, self_attention=False, fine_tune_coff=1)
odict_keys(['grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight', 'grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight', 'grover.encoders.edge_blocks.0.heads.0.mpn_k.act_func.weight', 'grover.encoders.edge_blocks.0.heads.0.mpn_k.W_h.weight', 'grover.encoders.edge_blocks.0.heads.0.mpn_v.act_func.weight', 'grover.encoders.edge_blocks.0.heads.0.mpn_v.W_h.weight', 'grover.encoders.edge_blocks.0.heads.1.mpn_q.act_func.weight', 'grover.encoders.edge_blocks.0.heads.1.mpn_q.W_h.weight', 'grover.encoders.edge_blocks.0.heads.1.mpn_k.act_func.weight', 'grover.encoders.edge_blocks.0.heads.1.mpn_k.W_h.weight'

In [5]:
# Print the dimensions of each weight tensor in the model's state_dict
if "state_dict" in model:
    print("Model weight tensor shapes:")
    for key, value in model["state_dict"].items():
        if hasattr(value, "shape"):
            print(f"{key}: {tuple(value.shape)}")
        else:
            print(f"{key}: Not a tensor or missing shape attribute")
else:
    print("No state_dict found in the model.")

Model weight tensor shapes:
grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.0.mpn_k.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.0.mpn_k.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.0.mpn_v.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.0.mpn_v.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.1.mpn_q.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.1.mpn_q.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.1.mpn_k.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.1.mpn_k.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.1.mpn_v.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.1.mpn_v.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks.0.heads.2.mpn_q.act_func.weight: (1,)
grover.encoders.edge_blocks.0.heads.2.mpn_q.W_h.weight: (1200, 1200)
grover.encoders.edge_blocks

In [7]:
# Print the values of the weights in the feed-forward network (ffn) layers of the loaded model

# Ensure 'state_dict' exists in the model dictionary
if "state_dict" not in model:
    raise KeyError("No 'state_dict' found in the loaded model.")

# Iterate through all keys in the state_dict and print weights for ffn layers
print("Feed-forward network (ffn) layer weights:")
for key, tensor in model["state_dict"].items():
    # Check if the key corresponds to an ffn layer and is a weight tensor
    if "ffn" in key and ".weight" in key:
        print(f"Layer: {key}")
        # Validate tensor type
        if hasattr(tensor, "numpy"):
            print(tensor.numpy())
        else:
            print("Tensor does not have a numpy() method.")
        print("-" * 60)

Feed-forward network (ffn) layer weights:
Layer: grover.encoders.ffn_atom_from_atom.W_1.weight
[[ 4.0252121e-06  8.0301032e-07  1.6447584e-05 ...  8.7161586e-03
   2.0930033e-02 -8.8472711e-03]
 [-3.7588234e-05  3.7492387e-06 -2.2893100e-06 ...  1.4373798e-02
  -2.8613904e-02 -2.6396010e-03]
 [-3.0557545e-05 -1.0276249e-06 -5.7388256e-06 ... -1.9698452e-02
  -3.6661640e-02 -1.4618294e-02]
 ...
 [-4.2463827e-04  4.3717564e-06  1.4851106e-05 ...  1.6418254e-02
   1.0370712e-02 -1.6428720e-02]
 [ 8.7928111e-06 -2.9649266e-06 -8.3153145e-06 ... -3.2102547e-04
  -1.2141189e-02  1.7105579e-02]
 [ 2.3316417e-03 -1.9217109e-06 -1.3187578e-03 ...  1.0773701e-02
  -1.5966428e-02  1.1881495e-02]]
------------------------------------------------------------
Layer: grover.encoders.ffn_atom_from_atom.W_2.weight
[[-0.00786536 -0.00764744 -0.00880815 ... -0.00339244 -0.00151713
   0.00570643]
 [ 0.00295709 -0.01201642  0.02430255 ...  0.00515964  0.02019144
   0.01501802]
 [ 0.01807962  0.00845418 -0.

In [None]:
from grover.model.models import GroverFpGeneration, GroverFinetuneTask

# Step 2: Extract model arguments directly from the checkpoint
args = checkpoint["args"]
print("Model arguments:", args)

# Step 3: Add any missing required fields (optional safety)
args.fingerprint_source = getattr(
    args, "fingerprint_source", "both"
)  # atom / bond / both
args.cuda = getattr(args, "cuda", torch.cuda.is_available())
args.features_dim = getattr(args, "features_dim", 0)
args.dropout = getattr(args, "dropout", 0.0)
args.features_only = getattr(args, "features_only", False)
args.ffn_num_layers = getattr(args, "ffn_num_layers", 2)
args.ffn_hidden_size = getattr(args, "ffn_hidden_size", 200)
args.output_size = getattr(args, "output_size", 0)

# Step 4: Initialize the model
# model = GroverFpGeneration(args)
model = GroverFinetuneTask(args)

# Step 5: Load pretrained weights
missing, unexpected = model.load_state_dict(checkpoint["state_dict"], strict=False)
print("Missing keys:", missing)
print("Unexpected keys:", unexpected)


# Optional: move to GPU
if args.cuda:
    model_host = model.cuda()

print("âœ… Loaded GroverFpGeneration model from checkpoint.")

# Print all model arguments in a readable format
if hasattr(args, "__dict__"):
    print("Loaded model arguments:")
    for key, value in vars(args).items():
        print(f"{key}: {value}")
else:
    print("Model arguments are not in Namespace format.")

Model arguments: Namespace(embedding_output_type='both', backbone='dualtrans', hidden_size=1200, bias=False, depth=6, activation='PReLU', undirected=False, weight_decay=2e-07, select_by_loss=True, skip_epoch=0, no_attach_fea=True, dist_coff=0.1, bond_drop_rate=0, input_layer='fc', num_attn_head=4, num_mt_block=1, dense=False, self_attention=False, fine_tune_coff=1, fingerprint_source='both', cuda=False, features_dim=0, dropout=0.0, features_only=False, ffn_num_layers=2, ffn_hidden_size=200)




AttributeError: 'Namespace' object has no attribute 'dataset_type'

In [19]:
import torch

# Save the model checkpoint with state_dict and args
checkpoint_to_save = {"state_dict": model.state_dict(), "args": args}

save_path = "grover_large_checkpoint.pt"

try:
    torch.save(checkpoint_to_save, save_path)
    print(f"Model checkpoint saved to {save_path}")
except Exception as e:
    print(f"Error saving model checkpoint: {e}")

Model checkpoint saved to grover_large_checkpoint.pt


### Try using Grover fingerprint function

In [None]:
# Import necessary modules
import types
from argparse import Namespace
from task.fingerprint import generate_fingerprints
from grover.util.utils import create_logger
import torch

# Define your arguments (adjust paths as needed)
args = Namespace(
    data_path="host_smiles.csv",  # Path to your SMILES CSV
    output_path="host_fingerprints.npz",  # Output path for fingerprints
    features_path=None,  # Or a list of feature file paths
    fingerprint_source="both",  # "atom", "bond", or "both"
    checkpoint_paths=["grover_large_checkpoint.pt"],  # List with your model checkpoint
    cuda=False,  # Use GPU if available
    batch_size=32,  # Batch size for DataLoader
)

args.fingerprint_source = getattr(
    args, "fingerprint_source", "both"
)  # atom / bond / both
args.cuda = getattr(args, "cuda", torch.cuda.is_available())
args.features_dim = getattr(args, "features_dim", 0)
args.dropout = getattr(args, "dropout", 0.0)

In [4]:
# Create a logger (optional)
logger = create_logger("fingerprint", save_dir=None, quiet=False)

# Ensure 'parser_name' is present in args to avoid AttributeError
if not hasattr(args, "parser_name"):
    args.parser_name = "fingerprint"  # or set to the appropriate parser name

# Ensure 'no_cache' is present in args to avoid AttributeError in mol2graph
if not hasattr(args, "no_cache"):
    args.no_cache = False  # Default to False if not set

# Generate fingerprints
fingerprints = generate_fingerprints(args, logger)

Total size = 3,459
Generating...


Loading data


Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_k.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_k.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_v.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_v.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.1.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.1.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.1.mpn_k.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.1.mpn_k.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.1.mpn_v.act_func.weight".
Loading pretr

KeyboardInterrupt: 

In [23]:
# Save fingerprints if you want
import numpy as np

np.savez_compressed(args.output_path, fps=fingerprints)

print(f"Fingerprints generated and saved to {args.output_path}")
print(f"Shape: {np.array(fingerprints).shape}")

Fingerprints generated and saved to host_fingerprints.npz
Shape: (3459, 4800)


In [31]:
# Import necessary modules
import types
from argparse import Namespace
from task.fingerprint import generate_fingerprints
from grover.util.utils import create_logger
import torch

# Define your arguments (adjust paths as needed)
args = Namespace(
    data_path="guest_smiles.csv",  # Path to your SMILES CSV
    output_path="guest_fingerprints.npz",  # Output path for fingerprints
    features_path=None,  # Or a list of feature file paths
    fingerprint_source="both",  # "atom", "bond", or "both"
    checkpoint_paths=["grover_large_checkpoint.pt"],  # List with your model checkpoint
    cuda=False,  # Use GPU if available
    batch_size=32,  # Batch size for DataLoader
)

args.fingerprint_source = getattr(
    args, "fingerprint_source", "both"
)  # atom / bond / both
args.cuda = getattr(args, "cuda", torch.cuda.is_available())
args.features_dim = getattr(args, "features_dim", 0)
args.dropout = getattr(args, "dropout", 0.0)

In [32]:
# Create a logger (optional)
logger = create_logger("fingerprint", save_dir=None, quiet=False)

# Ensure 'parser_name' is present in args to avoid AttributeError
if not hasattr(args, "parser_name"):
    args.parser_name = "fingerprint"  # or set to the appropriate parser name

# Ensure 'no_cache' is present in args to avoid AttributeError in mol2graph
if not hasattr(args, "no_cache"):
    args.no_cache = False  # Default to False if not set

# Generate fingerprints
fingerprints = generate_fingerprints(args, logger)

Total size = 3,459
Total size = 3,459
Total size = 3,459
Total size = 3,459
Total size = 3,459
Total size = 3,459
Generating...
Generating...
Generating...
Generating...
Generating...
Generating...


Loading data


Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.act_func.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretrained parameter "grover.encoders.edge_blocks.0.heads.0.mpn_q.W_h.weight".
Loading pretr

In [2]:
# Save fingerprints if you want
import numpy as np

np.savez_compressed(args.output_path, fps=fingerprints)

print(f"Fingerprints generated and saved to {args.output_path}")
print(f"Shape: {np.array(fingerprints).shape}")

NameError: name 'args' is not defined

In [34]:
import numpy as np
import pandas as pd

# Concatenate host and guest fingerprints row-wise and save as CSV


# Load host and guest fingerprints from .npz files
host_npz = np.load("host_fingerprints.npz")
guest_npz = np.load("guest_fingerprints.npz")

# Extract fingerprint arrays
host_fps: np.ndarray = host_npz["fps"]
guest_fps: np.ndarray = guest_npz["fps"]

# Validate shapes and types
if not (isinstance(host_fps, np.ndarray) and isinstance(guest_fps, np.ndarray)):
    raise TypeError("Fingerprint arrays must be numpy.ndarray.")
if host_fps.shape != guest_fps.shape:
    raise ValueError(f"Shape mismatch: host {host_fps.shape}, guest {guest_fps.shape}")
if host_fps.shape[0] != 3459 or host_fps.shape[1] != 4800:
    raise ValueError(f"Expected shape (3459, 4800), got {host_fps.shape}")

# Concatenate along columns (axis=1)
combined_fps: np.ndarray = np.concatenate([host_fps, guest_fps], axis=1)

# Convert to DataFrame for CSV output
combined_df: pd.DataFrame = pd.DataFrame(combined_fps)

# Save to CSV
csv_filename: str = "host_guest_fingerprints.csv"
combined_df.to_csv(csv_filename, index=False, float_format="%.8f")

print(
    f"Combined host-guest fingerprints saved to {csv_filename} with shape {combined_df.shape}"
)

Combined host-guest fingerprints saved to host_guest_fingerprints.csv with shape (3459, 9600)
