In [1]:
import rdkit
from rdkit import Chem
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, balanced_accuracy_score

In [21]:
from miprop.utils.dataset import Dataset

from miprop.conformer.rdkit import RDKitConformerGenerator

from miprop.descriptor.descriptor_3d.rdkit import RDKitGENERAL3D
from miprop.descriptor.descriptor_3d.molfeat import PharmacophoreDescriptor3D

from miprop.descriptor.descriptor_2d.molfeat import ECFP

from miprop.utils.scaler import BagMinMaxScaler

from miprop.mil.network.regressor import AttentionNetworkRegressor
from miprop.mil.wrapper.regressor import InstanceWrapperRegressor
from sklearn.ensemble import RandomForestRegressor

from miprop.utils.pipeline import Pipeline

from sklearn.model_selection import train_test_split

## Load data

1. Free dataset structure (data_col, prop_col)
2. Dataset descriptive statistics (num_mol, mol_weight, num_rot_bonds, etc.)

In [14]:
dataset = Dataset('../datasets_bench/solubility.csv')

In [15]:
dataset.get_molecules()

[<rdkit.Chem.rdchem.Mol at 0x7f41b1747d10>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1747d80>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1747df0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1747e60>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1747ed0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1747f40>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298040>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12980b0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298120>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298190>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298200>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298270>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12982e0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298350>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12983c0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298430>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12984a0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298510>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298580>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12985f0>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b1298660>,
 <rdkit.Chem.rdchem.Mol at 0x7f41b12986d0>,
 <rdkit.Chem.rdchem.Mol at 0x7f4

In [16]:
dataset.get_labels()

[-0.7546423143000001,
 -4.9505314842,
 -2.3925,
 -0.4121,
 -3.2502,
 -0.7165256146,
 -7.861382650099999,
 -2.43,
 -5.4154147676,
 -1.18,
 -0.6571,
 -1.7293168462,
 -4.635481621399999,
 -6.0994494699,
 -6.733128222,
 1.1189,
 -4.7,
 -4.02,
 -3.5580403929,
 -2.22,
 1.0124096986,
 -5.66,
 -4.6613,
 -7.532414871799999,
 -4.6,
 -2.65,
 -0.1822,
 -1.7922611704,
 -0.714875651,
 -3.3575,
 -4.2084,
 -1.7603,
 -1.0169828584,
 -7.5255403409,
 -2.3151,
 -7.724189002,
 -5.549815883300001,
 -7.3999,
 -6.2463311334,
 -8.1979,
 -2.6099,
 -2.46,
 -2.0798,
 -1.7720804309,
 -4.91,
 -1.78,
 0.0571886956,
 -3.0929,
 -6.7054442976,
 -1.0,
 -4.7672,
 -0.823,
 -3.49,
 0.4173208759,
 -5.0994877858,
 -4.1227687236,
 -2.8583,
 0.7899,
 -7.516526119400001,
 -1.2449,
 -7.253899897899999,
 0.1216,
 -1.4996,
 -4.26,
 -0.95,
 -0.8598992117,
 -7.91,
 -5.1905,
 -6.926267992300001,
 -3.6202,
 -5.458398412699999,
 -3.98,
 -2.2577490108,
 -3.77,
 -3.8955,
 0.0167319924,
 -3.78,
 0.5413,
 -7.3387,
 -2.83,
 -7.9318475776,
 

## Create model building pipeline

In [17]:
mols = dataset.get_molecules()
props = dataset.get_labels()

In [22]:
mols_train, mols_test, props_train, props_test = train_test_split(mols, props, test_size=0.2, random_state=42)

In [23]:
pipeline = Pipeline([RDKitConformerGenerator(num_conf=10, e_thresh=50), 
                     RDKitGENERAL3D(), 
                     BagMinMaxScaler(), 
                     AttentionNetworkRegressor()])

In [24]:
pipeline.fit(mols_train, props_train)

[H]/C(=C(\[H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C(=O)OC([H])([H])C([H])(OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])/C([H])=C(/[H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H])C([H])([H])OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])/C([H])=C(/[H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]  Conformer generation failed


In [26]:
pipeline.predict(mols_test)

[H]C([H])([H])C([H])([H])C(C([H])([H])OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H])(C([H])([H])OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H])C([H])([H])OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H]  Conformer generation failed


array([[-5.131189  ],
       [-3.852463  ],
       [-2.634476  ],
       [-4.942649  ],
       [-4.4577675 ],
       [-4.902735  ],
       [-3.1419814 ],
       [-2.4179146 ],
       [-2.2871845 ],
       [-4.133452  ],
       [-4.092112  ],
       [-1.894766  ],
       [-0.74472916],
       [-0.96547353],
       [-3.878848  ],
       [-3.6810474 ],
       [-1.8009349 ],
       [-3.3143067 ],
       [-4.902735  ],
       [-8.386013  ],
       [-3.9786255 ],
       [-1.6722375 ],
       [-6.1620955 ],
       [-2.192889  ],
       [-6.0040636 ],
       [-1.382227  ],
       [-5.179525  ],
       [-4.403491  ],
       [-5.100208  ],
       [-0.04259381],
       [-2.6287313 ],
       [-1.8821027 ],
       [-4.1534514 ],
       [-1.3282653 ],
       [-4.383201  ],
       [-1.8680668 ],
       [-4.2354474 ],
       [-4.1504154 ],
       [-5.6651835 ],
       [-4.559854  ],
       [-3.7501795 ],
       [-5.050859  ],
       [-4.669375  ],
       [-5.455565  ],
       [-2.2667048 ],
       [-4