In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Pretrained Featurizers

Aside from computed featurizers, `molfeat` also offers some pretrained featurizers. 

All pretrained featurizers inherit from `molfeat.trans.pretrained.base.PretrainedMolTransformer`, and most of them are based on models hosted on `ada`.

In [2]:
import datamol as dm
import random
import numpy as np

# Seed NumPy's and Python's global RNGs up front (presumably what makes the
# random draw below repeatable across fresh runs -- confirm).
np.random.seed(10)
random.seed(10)

# Summarize printed arrays once they exceed 10 elements, to keep outputs short.
np.set_printoptions(threshold=10)

# Work on a random 500-row subset of the FreeSolv table.
data = dm.data.freesolv().sample(n=500)

In [3]:
import torch
from molfeat.trans.pretrained import FCDTransformer
from molfeat.trans.pretrained import PretrainedDGLTransformer

In [4]:
# Instantiate the FCD featurizer; dtype="df" presumably requests pandas
# DataFrame output -- confirm against the molfeat documentation.
fcd = FCDTransformer(dtype="df")

In [5]:
# Featurize the sampled SMILES and display column 0 of the result (one value
# per molecule). enforce_dtype=True presumably casts the output to the dtype
# requested at construction -- confirm.
fcd(data["smiles"], enforce_dtype=True)[0]

0     -0.104110
1      0.135386
2      0.117059
3      0.103226
4     -0.259087
         ...   
495    0.053794
496    0.124357
497   -0.183770
498    0.171897
499   -0.055931
Name: 0, Length: 500, dtype: float32

In [6]:
# Build a GIN featurizer from the "gin_supervised_contextpred" model with sum
# pooling and float32 output. preload=True presumably loads the weights
# eagerly -- confirm.
gin = PretrainedDGLTransformer(
    kind="gin_supervised_contextpred",
    dtype=np.float32,
    preload=True,
    pooling="sum",
)
# Show the first row of the output (presumably the first molecule's embedding).
gin(data["smiles"], enforce_dtype=True)[0]

array([ 0.25235158,  0.8971151 , -0.58575034, ...,  0.23859033,
       -0.4665837 , -1.3065432 ], dtype=float32)

In [8]:
# Parameters can also be changed after construction through set_params;
# here the call simply re-applies the values the featurizer was built with.
gin.set_params(kind="gin_supervised_contextpred", pooling="sum")

## Testing the pretrained fingerprint with an sklearn model

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [10]:
# Load the full FreeSolv table: SMILES strings are the inputs, the "expt"
# column is the regression target.
df = dm.data.freesolv()
X = df["smiles"]
y = df["expt"]

# Hold out a test split; random_state is pinned so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [13]:
# Featurizer -> scaler -> random forest, assembled as a single scikit-learn
# pipeline so the featurizer's pooling mode can be tuned like any other
# hyper-parameter.
gin = PretrainedDGLTransformer(
    kind="gin_supervised_contextpred",
    dtype=np.float32,
    preload=True,
    pooling="sum",
    verbose=False,
)
pipe = Pipeline(
    [
        ("feat", gin),
        ("scaler", StandardScaler()),
        # random_state is pinned: without it the forest draws from NumPy's
        # global RNG, so re-running this cell (or running cells in a different
        # order) changes the fitted trees and can flip which pooling mode wins.
        ("rf", RandomForestRegressor(n_estimators=100, random_state=0)),
    ]
)
# "feat__pooling" addresses the `pooling` parameter of the "feat" step.
param_grid = dict(feat__pooling=["max", "mean"])
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=2)

grid_search.fit(X_train, y_train)

In [13]:
# Step 0 of the winning pipeline is the "feat" featurizer; displaying it shows
# which pooling mode the grid search selected.
grid_search.best_estimator_[0]

PretrainedDGLTransformer(kind="gin_contextpred", pooling="max", dtype=np.float32)

In [14]:
# Evaluate the selected pipeline on the held-out test split
# (presumably R^2, the default score for a regressor -- confirm).
grid_search.score(X_test, y_test)

0.7583147376911157