In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import molfeat
import datamol as dm
from molfeat.store.modelcard import ModelInfo

# for back compat and model transfer

# Model Card

## Defining a model card

In [3]:
# Model card for the MACCS keys fingerprint (hand-crafted, "rdkit" group).
maccs = ModelInfo(
    name="maccs",
    inputs="smiles",
    type="hand-crafted",
    group="rdkit",
    version=0,
    submitter="Datamol",
    # adjacent literals are concatenated by Python; the stored text is one string
    description=(
        "MACCS keys are 166-bit 2D structure fingerprints that are commonly used "
        "for the measure of molecular similarity. They described the presence of "
        "key features in molecular graphs"
    ),
    representation="vector",
    require_3D=False,
    tags=["maccs", "fixed", "2D", "binary", "rdkit"],
    authors=["MDL Information Systems"],
    reference="https://doi.org/10.1021/ci010132r",
)

In [4]:
# Print the ready-to-run snippet that the card generates for this featurizer.
print(maccs.usage())


import datamol as dm
from molfeat.trans import FPVecTransformer
data = dm.freesolv().iloc[:100]
transformer = FPVecTransformer(kind='maccs')
features = transformer(data["smiles"])



In [5]:
# Same steps as the snippet printed by maccs.usage(), run by hand.
# Differences from the printed snippet: the whole FreeSolv dataset is used
# (no .iloc[:100]) and FPVecTransformer is imported from molfeat.trans.fp
# instead of molfeat.trans.
import datamol as dm
from molfeat.trans.fp import FPVecTransformer
data = dm.freesolv()
transformer = FPVecTransformer(kind="maccs")
features = transformer(data["smiles"])

In [6]:
# Model card for ECFP (type "hashed", group "fp").
ecfp = ModelInfo(
    name="ecfp",
    inputs="smiles",
    type="hashed",
    group="fp",
    version=0,
    submitter="Datamol",
    # adjacent literals are concatenated by Python; the stored text is one string
    description=(
        "Extended-connectivity fingerprints (ECFPs) are a family of circular "
        "fingerprints that are commonly used for the measure of molecular "
        "similarity. They are based on the connectivity of atoms in molecular graphs."
    ),
    representation="vector",
    require_3D=False,
    tags=["ECFP", "fixed", "2D", "binary", "rdkit", "Morgan"],
    authors=["David Rogers", "Mathew Hahn"],
    # a DOI link is preferred for the reference
    reference="https://doi.org/10.1021/ci100050t",
)


In [7]:
import dgllife

In [8]:
# Example card for a pretrained GIN model (supervised + context prediction),
# followed by loading the matching checkpoint from dgllife.
gin_contextpred = ModelInfo(
    name="gin_supervised_contextpred",
    inputs="smiles",
    type="pretrained",
    group="dgllife",
    version=0,
    submitter="Datamol",
    # adjacent literals are concatenated by Python; the stored text is one string
    description=(
        "GIN neural network model pre-trained with supervised learning and "
        "context prediction on molecules from ChEMBL."
    ),
    representation="graph",
    require_3D=False,
    tags=["GIN", "dgl", "pytorch", "graph"],
    authors=[
        "Weihua Hu",
        "Bowen Liu",
        "Joseph Gomes",
        "Marinka Zitnik",
        "Percy Liang",
        "Vijay Pande",
        "Jure Leskovec",
    ],
    reference="https://arxiv.org/abs/1905.12265",
)
# the loaded model object is what gets registered next to the card later on
gin_contextpred_model = dgllife.model.load_pretrained("gin_supervised_contextpred")


Downloading gin_supervised_contextpred_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gin_supervised_contextpred.pth...
Pretrained model loaded


In [9]:
# an example of a Graphormer model pretrained on PCQM4Mv2
graphormer = ModelInfo(
    name="pcqm4mv2_graphormer_base",
    inputs="smiles",
    type="pretrained",
    group="graphormer",
    version=0,
    submitter="Datamol",
    # adjacent literals are concatenated by Python; the stored text is one string
    description=(
        "Pretrained Graph Transformer on PCQM4Mv2 Homo-Lumo energy gap "
        "prediction using 2D molecular graphs."
    ),
    representation="graph",
    require_3D=False,
    tags=["Graphormer", "PCQM4Mv2", "graph", "pytorch", "Microsoft"],
    authors=[
        "Chengxuan Ying",
        "Tianle Cai",
        "Shengjie Luo",
        "Shuxin Zheng",
        "Guolin Ke",
        "Di He",
        "Yanming Shen",
        "Tie-Yan Liu",
    ],
    reference="https://arxiv.org/abs/2106.05234",
)

In [10]:
# Show the usage snippet generated for the Graphormer card.
print(graphormer.usage())


import datamol as dm
from molfeat.trans.pretrained import GraphormerTransformer
data = dm.freesolv().iloc[:100]
transformer = GraphormerTransformer(kind='pcqm4mv2_graphormer_base')
features = transformer(data["smiles"])



In [11]:
# Example card for a JTVAE model pre-trained on ZINC.
jtvae = ModelInfo(
    name="jtvae_zinc_no_kl",
    inputs="smiles",
    type="pretrained",
    group="dgllife",
    version=0,
    submitter="Datamol",
    # adjacent literals are concatenated by Python; the stored text is one string
    description=(
        "A JTVAE pre-trained on ZINC for molecule generation, "
        "without KL regularization"
    ),
    representation="other",
    require_3D=False,
    tags=["JTNN", "JTVAE", "dgl", "pytorch", "junction-tree", "graph"],
    authors=["Wengong Jin", "Regina Barzilay", "Tommi Jaakkola"],
    reference="https://arxiv.org/abs/1802.04364v4",
)
# fetch the pretrained JTVAE from dgllife so it can be registered on the store
jtvae_model = dgllife.model.load_pretrained("JTVAE_ZINC_no_kl")


Downloading JTVAE_ZINC_no_kl_pre_trained.pth from https://data.dgl.ai/pre_trained/jtvae_ZINC_no_kl.pth...
Pretrained model loaded


In [12]:
# Show the usage snippet generated for the JTVAE card.
print(jtvae.usage())


import datamol as dm
from molfeat.trans.pretrained import PretrainedDGLTransformer
data = dm.freesolv().iloc[:100]
transformer = PretrainedDGLTransformer(kind='jtvae_zinc_no_kl')
features = transformer(data["smiles"])



### Model Store

In [13]:
from molfeat.store.modelstore import ModelStore

In [14]:
# Create the store handle with default arguments; the cells below all go through it.
store = ModelStore()

In [15]:
# List the model cards the store currently knows about.
store.available_models

[ModelInfo(name='gin_supervised_contextpred', inputs='smiles', type='pretrained', version=0, group='dgllife', submitter='Datamol', description='GIN neural network model pre-trained with supervised learning and context prediction on molecules from ChEMBL.', representation='graph', require_3D=False, tags=['GIN', 'dgl', 'pytorch', 'graph'], authors=['Weihua Hu', 'Bowen Liu', 'Joseph Gomes', 'Marinka Zitnik', 'Percy Liang', 'Vijay Pande', 'Jure Leskovec'], reference='https://arxiv.org/abs/1905.12265', created_at=datetime.datetime(2023, 2, 2, 19, 45, 57, 641213), sha256sum='5e23bc0926f85117456ee670aea6da91cedf25e1fe4a28c806b852c7b4ea5ceb'),
 ModelInfo(name='jtvae_zinc_no_kl', inputs='smiles', type='pretrained', version=0, group='dgllife', submitter='Datamol', description='A JTVAE pre-trained on ZINC for molecule generation, without KL regularization', representation='other', require_3D=False, tags=['JTNN', 'JTVAE', 'dgl', 'pytorch', 'junction-tree', 'graph'], authors=['Wengong Jin', 'Regina

#### Register models

In [16]:
# maccs is not a pretrained fingerprint,
# so there is no model object to save: only the card is passed (model=None).
# NOTE(review): force=True presumably overwrites an existing entry — confirm.
store.register(maccs, model=None, force=True)

  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-02 19:51:31.644 | INFO     | molfeat.store.modelstore:register:124 - Successfuly registered model maccs !


In [17]:
# Verify that the card registered above is now found by the store.
# NOTE(review): check_remote=True presumably also queries the remote store — confirm.
store.exists(maccs, check_remote=True)

True

In [18]:
# Register the GIN card together with the dgllife model loaded earlier.
store.register(gin_contextpred, model=gin_contextpred_model, force=True)

  0%|          | 0.00/7.12M [00:00<?, ?B/s]

2023-02-02 19:51:40.118 | INFO     | molfeat.store.modelstore:register:124 - Successfuly registered model gin_supervised_contextpred !


In [19]:
# Register the JTVAE card together with the dgllife model loaded earlier.
store.register(jtvae, model=jtvae_model, force=True)

  0%|          | 0.00/19.6M [00:00<?, ?B/s]

2023-02-02 19:51:55.866 | INFO     | molfeat.store.modelstore:register:124 - Successfuly registered model jtvae_zinc_no_kl !


In [20]:
# we don't need to save the graphormer model specifically:
# only the card is registered here, no model object is passed (model=None)
store.register(graphormer, model=None, force=True)

  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-02 19:51:57.991 | INFO     | molfeat.store.modelstore:register:124 - Successfuly registered model pcqm4mv2_graphormer_base !


In [21]:
import yaml
# Render every available card as YAML and print them joined by a "=======" line.
print(
    "=======\n".join(
        yaml.dump(card.dict()) for card in store.available_models
    )
)

authors:
- Weihua Hu
- Bowen Liu
- Joseph Gomes
- Marinka Zitnik
- Percy Liang
- Vijay Pande
- Jure Leskovec
created_at: 2023-02-02 19:51:17.228390
description: GIN neural network model pre-trained with supervised learning and context
  prediction on molecules from ChEMBL.
group: dgllife
inputs: smiles
name: gin_supervised_contextpred
reference: https://arxiv.org/abs/1905.12265
representation: graph
require_3D: false
sha256sum: 72dc062936b78b515ed5d0989f909ab7612496d698415d73826b974c9171504a
submitter: Datamol
tags:
- GIN
- dgl
- pytorch
- graph
type: pretrained
version: 0
authors:
- Wengong Jin
- Regina Barzilay
- Tommi Jaakkola
created_at: 2023-02-02 19:51:20.468939
description: A JTVAE pre-trained on ZINC for molecule generation, without KL regularization
group: dgllife
inputs: smiles
name: jtvae_zinc_no_kl
reference: https://arxiv.org/abs/1802.04364v4
representation: other
require_3D: false
sha256sum: eab8ecb8a7542a8cdf97410cb27f72aaf374fefef6a1f53642cc5b310cf2b7f6
submitter: Datam

In [22]:
# we don't have this model saved: the ecfp card was defined above but never
# passed to store.register, so the search returns an empty list
store.search(ecfp)

[]

In [23]:
# we do have this model saved: maccs was registered above, so its card is returned
store.search(maccs)

[ModelInfo(name='maccs', inputs='smiles', type='hand-crafted', version=0, group='rdkit', submitter='Datamol', description='MACCS keys are 166-bit 2D structure fingerprints that are commonly used for the measure of molecular similarity. They described the presence of key features in molecular graphs', representation='vector', require_3D=False, tags=['maccs', 'fixed', '2D', 'binary', 'rdkit'], authors=['MDL Information Systems'], reference='https://doi.org/10.1021/ci010132r', created_at=datetime.datetime(2023, 2, 2, 19, 51, 10, 688803), sha256sum='9c298d589a2158eb513cb52191144518a2acab2cb0c04f1df14fca0f712fa4a1')]

In [24]:
# Search also accepts field values as keywords; here cards are matched by name.
store.search(name="jtvae_zinc_no_kl")

[ModelInfo(name='jtvae_zinc_no_kl', inputs='smiles', type='pretrained', version=0, group='dgllife', submitter='Datamol', description='A JTVAE pre-trained on ZINC for molecule generation, without KL regularization', representation='other', require_3D=False, tags=['JTNN', 'JTVAE', 'dgl', 'pytorch', 'junction-tree', 'graph'], authors=['Wengong Jin', 'Regina Barzilay', 'Tommi Jaakkola'], reference='https://arxiv.org/abs/1802.04364v4', created_at=datetime.datetime(2023, 2, 2, 19, 51, 20, 468939), sha256sum='eab8ecb8a7542a8cdf97410cb27f72aaf374fefef6a1f53642cc5b310cf2b7f6')]

In [25]:
# load() is unpacked as a (model, card) pair, looked up by model name.
maccs_model, maccs_info = store.load("maccs")

In [26]:
# Nothing is displayed: maccs was registered with model=None, so there is no
# model object to show for this hand-crafted fingerprint.
maccs_model # empty because it's a hand-crafted fingerprint

In [27]:
# The card returned alongside the (empty) model by store.load.
maccs_info

ModelInfo(name='maccs', inputs='smiles', type='hand-crafted', version=0, group='rdkit', submitter='Datamol', description='MACCS keys are 166-bit 2D structure fingerprints that are commonly used for the measure of molecular similarity. They described the presence of key features in molecular graphs', representation='vector', require_3D=False, tags=['maccs', 'fixed', '2D', 'binary', 'rdkit'], authors=['MDL Information Systems'], reference='https://doi.org/10.1021/ci010132r', created_at=datetime.datetime(2023, 2, 2, 18, 14, 6, 571706), sha256sum='9c298d589a2158eb513cb52191144518a2acab2cb0c04f1df14fca0f712fa4a1')

In [28]:
# Reload the GIN model and its card from the store by name.
gin_reloaded_model, gin_reloaded_info = store.load("gin_supervised_contextpred")

  0%|          | 0.00/653 [00:00<?, ?B/s]

  0%|          | 0.00/7.12M [00:00<?, ?B/s]

In [29]:
# Inspect the model object that came back from the store.
gin_reloaded_model

GIN(
  (dropout): Dropout(p=0.5, inplace=False)
  (node_embeddings): ModuleList(
    (0): Embedding(120, 300)
    (1): Embedding(3, 300)
  )
  (gnn_layers): ModuleList(
    (0): GINLayer(
      (mlp): Sequential(
        (0): Linear(in_features=300, out_features=600, bias=True)
        (1): ReLU()
        (2): Linear(in_features=600, out_features=300, bias=True)
      )
      (edge_embeddings): ModuleList(
        (0): Embedding(6, 300)
        (1): Embedding(3, 300)
      )
      (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): GINLayer(
      (mlp): Sequential(
        (0): Linear(in_features=300, out_features=600, bias=True)
        (1): ReLU()
        (2): Linear(in_features=600, out_features=300, bias=True)
      )
      (edge_embeddings): ModuleList(
        (0): Embedding(6, 300)
        (1): Embedding(3, 300)
      )
      (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): GI

In [32]:
# The reloaded card can generate its usage snippet just like a locally built one.
print(gin_reloaded_info.usage())


import datamol as dm
from molfeat.trans.pretrained import PretrainedDGLTransformer
data = dm.freesolv().iloc[:100]
transformer = PretrainedDGLTransformer(kind='gin_supervised_contextpred')
features = transformer(data["smiles"])



In [31]:
# Execute the usage snippet produced by the reloaded card. Per the printed usage,
# the snippet assigns `data`, `transformer` and `features` in the notebook
# namespace, which is why `features` can be displayed right after.
# NOTE(review): exec() runs whatever text the card returns; only do this with
# cards coming from a store you trust.
exec(gin_reloaded_info.usage())
features

array([[ 5.0801709e-03, -1.5467817e-01, -1.2526581e-01, ...,
         2.3464283e-01, -7.5401053e-02,  2.0824190e-02],
       [-7.0198812e-04, -8.1730559e-02, -6.2045324e-01, ...,
        -2.3603138e-01,  7.3010635e-01,  3.8331217e-01],
       [ 2.7388150e-02,  1.8647036e-01, -2.4399137e-01, ...,
         5.7929408e-02,  1.7175178e-01, -1.7541242e-01],
       ...,
       [ 2.5242649e-02,  1.7176455e-01, -1.3434565e-01, ...,
         1.2988050e-01,  2.0930676e-01, -1.2900873e-01],
       [-2.7901845e-03, -1.3657156e-01,  4.3633554e-02, ...,
         2.1721300e-01,  1.0626764e-01,  1.2404752e-01],
       [-1.5046312e-02,  4.4183634e-02,  1.9645907e-01, ...,
         4.8413888e-02, -2.6770476e-01,  2.4833913e-01]], dtype=float32)

---------

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from molfeat.store.modelstore import ModelStore

In [4]:
# Create the store handle with default arguments.
store = ModelStore()

In [5]:
# Look up the ChemBERTa card by name; the result is indexed as a list of matches below.
chemberta_card = store.search(name="DeepChem-ChemBERTa-77M-MLM")

In [6]:
# Print the usage snippet of the first match.
# NOTE(review): [0] raises IndexError if the search returned no card.
print(chemberta_card[0].usage())


import datamol as dm
from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer
data = dm.freesolv().iloc[:100]
transformer = PretrainedHFTransformer(kind='DeepChem-ChemBERTa-77M-MLM', notation='smiles')
features = transformer(data["smiles"])



In [7]:
# Run the snippet printed by the card verbatim: featurize the SMILES column of
# the first 100 FreeSolv rows with the 'DeepChem-ChemBERTa-77M-MLM' transformer.
import datamol as dm
from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer
data = dm.freesolv().iloc[:100]
transformer = PretrainedHFTransformer(kind='DeepChem-ChemBERTa-77M-MLM', notation='smiles')
features = transformer(data["smiles"])

Some weights of the model checkpoint at /Users/manu/Library/Caches/molfeat/DeepChem-ChemBERTa-77M-MLM/model.save were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /Users/manu/Library/Caches/molfeat/DeepChem-ChemBERTa-77M-MLM/model.save and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# Inspect the features computed in the previous cell.
features

array([[-0.08768422, -0.07157033, -0.04022325, ...,  0.10310469,
         0.16929244, -0.13035022],
       [ 0.10688866,  0.10707027,  0.23671532, ...,  0.01356442,
         0.09901753, -0.18887427],
       [ 0.18590733, -0.3194937 , -0.16546208, ...,  0.16675094,
         0.12800565, -0.1350749 ],
       ...,
       [-0.00189916, -0.3090676 , -0.11365637, ...,  0.17380431,
        -0.08730147, -0.4555515 ],
       [ 0.18555567, -0.1301071 , -0.17172852, ...,  0.08563311,
        -0.00947405, -0.3082643 ],
       [-0.10831349, -0.08913019, -0.02071786, ...,  0.12807605,
         0.2750607 , -0.11662877]], dtype=float32)