# Import Packages

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split

from chemprop import data, featurizers

# Change data inputs here

In [None]:
# Path to the input .csv file. It must contain the SMILES strings and the
# target values; the extra molecular features are computed from the SMILES
# further down.
test_path = '../tests/data/regression/mol.csv'

## Load data

In [None]:
# Read the CSV located at `test_path` into a DataFrame and show it.
df_test = pd.read_csv(filepath_or_buffer=test_path)
df_test

Next, we will compute a couple of additional molecular features using RDKit.

In [None]:
# Parse every SMILES string once and reuse the resulting molecule for both
# descriptors, instead of re-parsing it for each new column.
mols = [Chem.MolFromSmiles(smi) for smi in df_test['smiles']]

# Chem.MolFromSmiles returns None for a SMILES string it cannot parse; report
# those entries explicitly rather than failing inside the descriptor call.
bad_smiles = [smi for smi, mol in zip(df_test['smiles'], mols) if mol is None]
if bad_smiles:
    raise ValueError(f'Could not parse SMILES: {bad_smiles}')

# Add one column per descriptor: molecular weight and the QED score.
df_test['mw'] = [Descriptors.MolWt(mol) for mol in mols]
df_test['qed'] = [Descriptors.qed(mol) for mol in mols]
df_test

In [None]:
# Names of the DataFrame columns that feed the model.
smiles_column = 'smiles'              # column holding the SMILES strings
target_columns = ['lipo']             # columns holding the target values
mol_features_columns = ['mw', 'qed']  # columns holding the extra molecule features

## Get SMILES, targets and features

In [None]:
# Pull the underlying values of the SMILES column, the target columns and the
# feature columns out of the DataFrame.
smis = df_test[smiles_column].values
ys = df_test[target_columns].values
fs = df_test[mol_features_columns].values

# Preview the first five entries of each.
smis[:5], ys[:5], fs[:5]

In [None]:
# Build one MoleculeDatapoint per row, pairing each SMILES string with its
# target values (passed as `y`) and its extra features (passed as `x_f`).
all_data = [
    data.MoleculeDatapoint.from_smi(smiles, y=targets, x_f=features)
    for smiles, targets, features in zip(smis, ys, fs)
]
all_data

## Get Molecule Datasets

In [None]:
# Hold out 10% of the datapoints (test_size=0.1), then split that hold-out in
# half (test_size=0.5) to obtain the validation and test sets.
# NOTE(review): no random_state is passed, so the partition presumably differs
# from run to run -- pass one if a reproducible split is needed.
train_data, val_test_data = train_test_split(all_data, test_size=0.1)
val_data, test_data = train_test_split(val_test_data, test_size=0.5)

In [None]:
# A single featurizer instance is shared by all three datasets.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Wrap each split in a MoleculeDataset.
train_dset = data.MoleculeDataset(train_data, featurizer)
val_dset = data.MoleculeDataset(val_data, featurizer)
test_dset = data.MoleculeDataset(test_data, featurizer)

# normalize_targets() on the training set returns a scaler; that same scaler
# is then handed to the validation and test sets.
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)
test_dset.normalize_targets(scaler)