## 1. Load dataset

In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# Load both splits in one statement. header=None tells pandas not to treat any
# row as a header, so columns are addressed by integer position afterwards.
data_train, data_test = (
    pd.read_csv('CHEMBL1862_Ki/train.csv', header=None),
    pd.read_csv('CHEMBL1862_Ki/test.csv', header=None),
)

In [3]:
# Column 0 is later parsed as SMILES; column 1 is later used as the regression target.
smi_train = data_train[0].to_list()
prop_train = data_train[1].to_list()
smi_test = data_test[0].to_list()
prop_test = data_test[1].to_list()

In [4]:
# Build the training molecules. A SMILES for which MolFromSmiles yields a falsy
# result is skipped together with its target, keeping mols_train and y_train aligned.
mols_train = []
y_train = []
for smi, prop in zip(smi_train, prop_train):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue
    mols_train.append(mol)
    y_train.append(prop)

In [5]:
# Build the test molecules. A SMILES for which MolFromSmiles yields a falsy
# result is skipped together with its target, keeping mols_test and y_test aligned.
mols_test = []
y_test = []
for smi, prop in zip(smi_test, prop_test):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue
    mols_test.append(mol)
    y_test.append(prop)

## 1.5 Reduce the dataset size for faster pipeline reproduction (for playing around)

In [6]:
# Keep only the first 30 training and first 10 test entries so the rest of the
# pipeline runs quickly. Molecules and targets are cut at the same index.
mols_train = mols_train[:30]
y_train = y_train[:30]
mols_test = mols_test[:10]
y_test = y_test[:10]

## 2. Conformer generation

In [7]:
from miprop.conformer.rdkit import RDKitConformerGenerator

from miprop.utils.logging import FailedMolecule, FailedConformer, FailedDescriptor

In [8]:
# Single conformer generator instance, reused for the training and the test molecules.
# NOTE(review): num_conf is presumably the number of conformers requested per molecule,
# e_thresh an energy cutoff and num_cpu the number of worker processes — confirm against miprop.
conf_gen = RDKitConformerGenerator(num_conf=20, e_thresh=50, num_cpu=10)

In [9]:
# Run conformer generation for both splits with the same generator
# (training molecules first, then test molecules).
confs_train, confs_test = (
    conf_gen.generate(mols_train),
    conf_gen.generate(mols_test),
)

In [10]:
# Report the positions of training molecules whose result is a FailedConformer.
for n, conf in enumerate(confs_train):
    if not isinstance(conf, FailedConformer):
        continue
    print(f'Conformer generation failed for training molecule number {n}')

# Same check for the test molecules.
for n, conf in enumerate(confs_test):
    if not isinstance(conf, FailedConformer):
        continue
    print(f'Conformer generation failed for test molecule number {n}')

## 3. Descriptor calculation

In [11]:
from miprop.descriptor.descriptor_3d.rdkit import (RDKitGEOM, 
                                                   RDKitAUTOCORR, 
                                                   RDKitMORSE, 
                                                   RDKitGETAWAY, 
                                                   RDKitRDF, 
                                                   RDKitWHIM)

from miprop.descriptor.descriptor_3d.molfeat import (MolFeatPharmacophore, 
                                                     MolFeatUSRD, 
                                                     MolFeatElectroShape)

In [12]:
# Descriptor calculator applied to the generated conformers below.
# NOTE(review): the other 3D descriptor classes imported above are presumably
# drop-in alternatives for this one — confirm before swapping.
descr_calc = MolFeatPharmacophore()

In [13]:
# Compute descriptors for both splits with the same calculator
# (training conformers first, then test conformers).
x_train, x_test = (
    descr_calc.transform(confs_train),
    descr_calc.transform(confs_test),
)

In [14]:
# Report the positions of molecules whose descriptor calculation failed.
# The check is made on the descriptor result `x` itself. The previous version
# tested the stale global `conf` left over from the conformer-check loop, so a
# FailedDescriptor entry was never detected.
for n, x in enumerate(x_train):
    if isinstance(x, FailedDescriptor):
        print(f'Descriptor calculation failed for training molecule number {n}')

# Same check for the test descriptors.
for n, x in enumerate(x_test):
    if isinstance(x, FailedDescriptor):
        print(f'Descriptor calculation failed for test molecule number {n}')

In [15]:
# Display the shape of the descriptor result for the first training molecule.
x_train[0].shape

(20, 2048)

## 4. Model training

In [16]:
from miprop.mil.network.regressor import (AttentionNetworkRegressor,
                                          SelfAttentionNetworkRegressor,
                                          GatedAttentionNetworkRegressor,
                                          TemperatureAttentionNetworkRegressor,
                                          GumbelAttentionNetworkRegressor,
                                          GlobalTemperatureAttentionNetworkRegressor,
                                          DynamicPoolingNetworkRegressor,
                                          GaussianPoolingNetworkRegressor,
                                          InstanceNetworkRegressor,
                                          BagNetworkRegressor)

from miprop.mil.wrapper.regressor import (InstanceWrapperRegressor, 
                                          BagWrapperRegressor)

from miprop.utils.scaler import BagMinMaxScaler

from sklearn.metrics import r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [17]:
# Create the scaler for the descriptor bags.
scaler = BagMinMaxScaler()

# Fit on the training descriptors only; the test descriptors are never passed to fit().
scaler.fit(x_train)

# Apply the fitted scaler to both splits.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [18]:
# Keyword arguments unpacked into the network regressors constructed with **hparams below.
# NOTE(review): init_cuda=True presumably requires a CUDA-capable device — confirm
# before running on a CPU-only machine.
hparams = dict(
    hidden_layer_sizes=(256, 128, 64),
    num_epoch=300,
    batch_size=128,
    learning_rate=0.001,
    weight_decay=0.001,
    instance_weight_dropout=0.1,
    init_cuda=True,
    verbose=False,
)

In [19]:
# Train an AttentionNetworkRegressor with the shared hyperparameters on the scaled training bags.
model = AttentionNetworkRegressor(**hparams)
model.fit(x_train_scaled, y_train)

# Predict on the scaled test bags.
y_pred = model.predict(x_test_scaled)

# Report the R^2 score of the test predictions.
print(r2_score(y_test, y_pred))

0.49830859899520874


In [20]:
# Train a DynamicPoolingNetworkRegressor with the same hyperparameters; `model` is rebound here.
model = DynamicPoolingNetworkRegressor(**hparams)
model.fit(x_train_scaled, y_train)

# Predict on the scaled test bags.
y_pred = model.predict(x_test_scaled)

# Report the R^2 score of the test predictions.
print(r2_score(y_test, y_pred))

0.5815784931182861


In [21]:
# Wrap a scikit-learn MLPRegressor (default settings) in InstanceWrapperRegressor and train it.
# hparams is not used for this model.
model = InstanceWrapperRegressor(MLPRegressor())
model.fit(x_train_scaled, y_train)

# Predict on the scaled test bags.
y_pred = model.predict(x_test_scaled)

# Report the R^2 score of the test predictions.
print(r2_score(y_test, y_pred))

0.34112732395979206


## 5. Predict instance weights

In [22]:
# Attention network with instance_weight_dropout=0.01; every other constructor
# argument is left at its default (hparams is not passed here).
model = AttentionNetworkRegressor(instance_weight_dropout=0.01)
model.fit(x_train_scaled, y_train)

# Predictions and instance weights for the scaled test bags.
y_pred = model.predict(x_test_scaled)
w_pred = model.get_instance_weights(x_test_scaled)

# Display the weights of the test entry at index 1, rounded to two decimals.
w_pred[1].round(2)

array([0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05,
       0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
      dtype=float32)

In [23]:
# Same attention network, now with instance_weight_dropout=0.90, to compare the
# resulting instance weights with the low-dropout run.
model = AttentionNetworkRegressor(instance_weight_dropout=0.90)
model.fit(x_train_scaled, y_train)

# Predictions and instance weights for the scaled test bags.
y_pred = model.predict(x_test_scaled)
w_pred = model.get_instance_weights(x_test_scaled)

# Display the weights of the test entry at index 1, rounded to two decimals.
w_pred[1].round(2)

array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0.5, 0. , 0. ], dtype=float32)

In [24]:
# Dynamic pooling network with instance_weight_dropout=0.90; every other
# constructor argument is left at its default.
model = DynamicPoolingNetworkRegressor(instance_weight_dropout=0.90)
model.fit(x_train_scaled, y_train)

# Predictions and instance weights for the scaled test bags.
y_pred = model.predict(x_test_scaled)
w_pred = model.get_instance_weights(x_test_scaled)

# Display the weights of the test entry at index 1, rounded to two decimals.
w_pred[1].round(2)

array([0.02, 0.12, 0.09, 0.04, 0.05, 0.08, 0.07, 0.02, 0.07, 0.04, 0.01,
       0.05, 0.06, 0.03, 0.02, 0.02, 0.03, 0.11, 0.05, 0.02],
      dtype=float32)