# Import Packages

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split

from chemprop import data, featurizers

# Change data inputs here

In [2]:
# Path to your data .csv file; it holds the SMILES strings and target values
# (and, optionally, feature columns). Edit this to point at your own data.
test_path = "../tests/data/regression/mol.csv"

## Load data

In [3]:
# Read the input table from `test_path` into a DataFrame.
df_test = pd.read_csv(filepath_or_buffer=test_path)

# Display the loaded frame as the cell output.
df_test

Unnamed: 0,smiles,lipo
0,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,3.54
1,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,-1.18
2,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,3.69
3,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,3.37
4,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,3.10
...,...,...
95,CC(C)N(CCCNC(=O)Nc1ccc(cc1)C(C)(C)C)C[C@H]2O[C...,2.20
96,CCN(CC)CCCCNc1ncc2CN(C(=O)N(Cc3cccc(NC(=O)C=C)...,2.04
97,CCSc1c(Cc2ccccc2C(F)(F)F)sc3N(CC(C)C)C(=O)N(C)...,4.49
98,COc1ccc(Cc2c(N)n[nH]c2N)cc1,0.20


Next, we will compute two additional molecular features with RDKit: the molecular weight (`mw`) and the quantitative estimate of drug-likeness (`qed`).

In [4]:
# Parse every SMILES string exactly once and reuse the resulting Mol object for
# both descriptors (previously each SMILES was parsed once per descriptor).
mols = [Chem.MolFromSmiles(smi) for smi in df_test['smiles']]

# Chem.MolFromSmiles returns None for a SMILES string it cannot parse. Stop here
# with a readable message instead of failing inside the descriptor call.
invalid_smiles = [smi for smi, mol in zip(df_test['smiles'], mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        f"RDKit could not parse {len(invalid_smiles)} SMILES string(s), e.g. {invalid_smiles[:5]}"
    )

# Add the two descriptor columns: molecular weight and QED score.
df_test['mw'] = [Descriptors.MolWt(mol) for mol in mols]
df_test['qed'] = [Descriptors.qed(mol) for mol in mols]

# Display the frame with the new columns as the cell output.
df_test

Unnamed: 0,smiles,lipo,mw,qed
0,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,3.54,340.858,0.728444
1,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,-1.18,494.591,0.545587
2,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,3.69,321.829,0.807761
3,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,3.37,419.890,0.506650
4,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,3.10,381.480,0.747686
...,...,...,...,...
95,CC(C)N(CCCNC(=O)Nc1ccc(cc1)C(C)(C)C)C[C@H]2O[C...,2.20,540.669,0.256099
96,CCN(CC)CCCCNc1ncc2CN(C(=O)N(Cc3cccc(NC(=O)C=C)...,2.04,656.615,0.139020
97,CCSc1c(Cc2ccccc2C(F)(F)F)sc3N(CC(C)C)C(=O)N(C)...,4.49,456.555,0.482692
98,COc1ccc(Cc2c(N)n[nH]c2N)cc1,0.20,218.260,0.721007


In [5]:
# Name of the column containing the SMILES strings.
smiles_column = "smiles"

# Names of the columns containing the targets (a list, even for a single target).
target_columns = ["lipo"]

# Names of the columns containing the extra molecule features.
mol_features_columns = ["mw", "qed"]

## Get SMILES, targets and features

In [6]:
# Pull the model inputs out of the DataFrame as NumPy arrays:
#   smis - values of the single column named by `smiles_column`
#   ys   - one column per entry in `target_columns`
#   fs   - one column per entry in `mol_features_columns`
smis = df_test[smiles_column].to_numpy()
ys = df_test[target_columns].to_numpy()
fs = df_test[mol_features_columns].to_numpy()

# Preview the first five rows of each array as the cell output.
smis[:5], ys[:5], fs[:5]

(array(['Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14',
        'COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23',
        'COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl',
        'OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3',
        'Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)NCC#N)c1'],
       dtype=object),
 array([[ 3.54],
        [-1.18],
        [ 3.69],
        [ 3.37],
        [ 3.1 ]]),
 array([[340.858     ,   0.72844422],
        [494.591     ,   0.54558681],
        [321.829     ,   0.80776119],
        [419.89      ,   0.50664956],
        [381.48      ,   0.74768647]]))

In [7]:
# Build one MoleculeDatapoint per row from its SMILES string via `from_smi`;
# `y` carries that row's target values and `x_f` its extra molecule features.
all_data = [data.MoleculeDatapoint.from_smi(smi, y=y, x_f=f) for smi, y, f in zip(smis, ys, fs)]

# Preview only the first five datapoints; echoing the whole list prints one
# long repr per molecule and floods the notebook output.
all_data[:5]

[MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f339279aff0>, y=array([3.54]), weight=1, gt_mask=None, lt_mask=None, x_f=array([340.858     ,   0.72844422]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f339279b290>, y=array([-1.18]), weight=1, gt_mask=None, lt_mask=None, x_f=array([494.591     ,   0.54558681]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f339279b060>, y=array([3.69]), weight=1, gt_mask=None, lt_mask=None, x_f=array([321.829     ,   0.80776119]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f339279b140>, y=array([3.37]), weight=1, gt_mask=None, lt_mask=None, x_f=array([419.89      ,   0.50664956]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f339279b1b0>, y=array([3.1]), weight=1, gt_mask=None, lt_mask=None, x_f=array([381.4

## Get Molecule Datasets

In [8]:
# Seed for the random shuffles below. Without a fixed `random_state` the split
# changes on every run, so results cannot be reproduced. Change the value to
# draw a different (but still repeatable) split.
split_seed = 0

# Hold out 10% of the datapoints, then divide that held-out part evenly into
# validation and test sets; the remaining 90% is the training set.
train_data, val_test_data = train_test_split(all_data, test_size=0.1, random_state=split_seed)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=split_seed)

In [9]:
# One featurizer instance is shared by the train, validation and test datasets.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Build the training dataset first: calling normalize_targets() with no argument
# returns the scaler that is reused below.
# (presumably the scaler is fit on the training targets only - confirm against the chemprop docs)
train_dset = data.MoleculeDataset(train_data, featurizer)
scaler = train_dset.normalize_targets()

# Pass the scaler obtained from the training set to the validation and test
# sets so all three use the same target transformation.
# NOTE(review): only targets are normalized in this cell. The extra features
# passed as `x_f` (mw ~ hundreds, qed in 0-1) are on very different scales -
# confirm whether they are scaled elsewhere before training.
# NOTE(review): the test targets are normalized here as well - confirm this
# matches how predictions are un-scaled when the model is evaluated.
val_dset = data.MoleculeDataset(val_data, featurizer)
val_dset.normalize_targets(scaler)
test_dset = data.MoleculeDataset(test_data, featurizer)
# This is the last expression of the cell, so its return value (if any) is
# rendered as the cell output.
test_dset.normalize_targets(scaler)