# Import Packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from chemprop import data, featurizers

# Change data inputs here

In [2]:
# Path to the .csv file that holds the SMILES strings, the target values
# and the extra molecule features.
test_path = 'tests/data/test_smiles.csv'

# Name of the column containing the SMILES strings.
smiles_column = 'smiles'

# Names of the columns containing the targets (a list, so several may be given).
target_columns = ['targets']

# Names of the columns containing the extra molecule features (also a list).
mol_features_columns = ['features']

## Load data

In [3]:
# Read the whole input table into memory; every later cell slices its
# columns out of this frame.
df_test = pd.read_csv(filepath_or_buffer=test_path)

# Bare last expression so the notebook renders the frame.
df_test

Unnamed: 0,smiles,features,targets
0,CCOP(=S)(OCC)SC(CCl)N2C(=O)c1ccccc1C2=O,1,4
1,Brc1ccc(Br)cc1,0,3
2,Cn2c(=O)on(c1ccc(Cl)c(Cl)c1)c2=O,0,3
3,Oc1ccc(cc1)c2ccccc2,1,4
4,CC1=C(CCCO1)C(=O)Nc2ccccc2,1,3
5,CCOC=C,1,4
6,CCC#C,0,3
7,COc1ncnc2nccnc12,1,4
8,CCCCC(C)(O)CC,0,3
9,Clc1ccc(Cl)cc1,1,4


## Get SMILES, targets and features

In [5]:
# Pull the model inputs out of the frame as NumPy arrays.
# Selecting with a single column name yields a 1-D array, while selecting
# with a list of names yields a 2-D array with one column per name.
smis = df_test[smiles_column].to_numpy()
ys = df_test[target_columns].to_numpy()
fs = df_test[mol_features_columns].to_numpy()

# Preview the first five entries of each array.
smis[:5], ys[:5], fs[:5]

(array(['CCOP(=S)(OCC)SC(CCl)N2C(=O)c1ccccc1C2=O', 'Brc1ccc(Br)cc1',
        'Cn2c(=O)on(c1ccc(Cl)c(Cl)c1)c2=O', 'Oc1ccc(cc1)c2ccccc2',
        'CC1=C(CCCO1)C(=O)Nc2ccccc2'], dtype=object),
 array([[4],
        [3],
        [3],
        [4],
        [3]]),
 array([[1],
        [0],
        [0],
        [1],
        [1]]))

In [6]:
# Build one datapoint per row: the SMILES string is parsed by `from_smi`,
# the row's targets go in as `y` and its extra features as `x_f`.
all_data = [
    data.MoleculeDatapoint.from_smi(smiles, y=targets, x_f=features)
    for smiles, targets, features in zip(smis, ys, fs)
]

# Bare last expression so the notebook shows the resulting list.
all_data

[MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b6260>, y=array([4]), weight=1, gt_mask=None, lt_mask=None, x_f=array([1]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b63b0>, y=array([3]), weight=1, gt_mask=None, lt_mask=None, x_f=array([0]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b6570>, y=array([3]), weight=1, gt_mask=None, lt_mask=None, x_f=array([0]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b6650>, y=array([4]), weight=1, gt_mask=None, lt_mask=None, x_f=array([1]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b6730>, y=array([3]), weight=1, gt_mask=None, lt_mask=None, x_f=array([1]), x_phase=None, V_f=None, E_f=None, V_d=None),
 MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x29f3b6810>, y=array([4]), w

## Get Molecule Datasets

In [7]:
# Fixed seed so that Restart Kernel -> Run All reproduces the exact same
# train/validation/test partition; without it every run shuffles differently
# and downstream results cannot be compared between runs.
split_seed = 0

# First hold out 10% of the datapoints, then cut that hold-out in half to
# obtain the validation and test sets.
# NOTE(review): the second split needs at least two datapoints in
# `val_test_data`; on a very small dataset the 10% hold-out can contain a
# single molecule, in which case scikit-learn raises a ValueError — confirm
# the dataset is large enough or increase `test_size`.
train_data, val_test_data = train_test_split(
    all_data, test_size=0.1, random_state=split_seed
)
val_data, test_data = train_test_split(
    val_test_data, test_size=0.5, random_state=split_seed
)

In [8]:
# One featurizer instance is shared by all three datasets.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Wrap each split in a dataset.
train_dset = data.MoleculeDataset(train_data, featurizer)
val_dset = data.MoleculeDataset(val_data, featurizer)
test_dset = data.MoleculeDataset(test_data, featurizer)

# Normalizing the training set without arguments returns a scaler; that same
# scaler is then handed to the validation and test sets so all three are
# normalized with it (presumably fitted on the training targets only —
# confirm against the chemprop docs).
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)
test_dset.normalize_targets(scaler)