In [1]:
import pandas as pd

from lightning import pytorch as pl

from chemprop.v2 import data
from chemprop.v2 import featurizers
from chemprop.v2.models import modules, models, metrics, multi

NameError: name 'OutputTransform' is not defined

# Load data

In [None]:
# User-adjustable data inputs: change these to point at your own data.

# Path to the .csv file containing SMILES strings and target values.
input_path = '../tests/data/regression_multimolecule.csv'

# Names of the columns containing SMILES strings (one column per molecule).
smiles_columns = ['smiles', 'solvent']

# Names of the columns containing the targets.
target_columns = ['peakwavs_max']

In [None]:
# Read the input table from `input_path` and show it as the cell output.
df_input = pd.read_csv(filepath_or_buffer=input_path)
df_input

Unnamed: 0,smiles,solvent,peakwavs_max
0,CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2C...,ClCCl,642.0
1,C(=C/c1cnccn1)\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3c...,ClCCl,420.0
2,CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+]...,O,544.0
3,c1ccc2[nH]ccc2c1,O,290.0
4,CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5c...,ClC(Cl)Cl,736.0
...,...,...,...
495,CCC(=O)c1ccc2cc(N(C)C)ccc2c1,Clc1ccccc1,254.0
496,O=Cc1cc(O)c(C=O)cc1O,CN(C)P(=O)(N(C)C)N(C)C,427.0
497,Cc1ccc(C2=C3C=c4c(oc5ccccc45)=[N+]3[B-](F)(F)n...,CC#N,578.0
498,Cc1ccc(-c2nc(-c3cc([N+](=O)[O-])ccc3O)[nH]c2-c...,CO,284.0


In [None]:
# Pull the SMILES columns and the target columns out of the table as arrays:
# one row per datapoint, one column per entry in the respective column list.
smiss = df_input[smiles_columns].to_numpy()
ys = df_input[target_columns].to_numpy()

In [None]:
# Preview the first five rows of SMILES and targets.
(smiss[:5], ys[:5])

(array([['CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2CCCC)C(=O)N(CCCC)C1=S',
         'ClCCl'],
        ['C(=C/c1cnccn1)\\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3cnccn3)cc2)cc1',
         'ClCCl'],
        ['CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+](C)C)cc-3oc2c1',
         'O'],
        ['c1ccc2[nH]ccc2c1', 'O'],
        ['CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5ccccc5c4C3(C)C)CCCC1=C2c1ccccc1C(=O)O',
         'ClC(Cl)Cl']], dtype=object),
 array([[642.],
        [420.],
        [544.],
        [290.],
        [736.]]))

In [None]:
# Build one list of molecule datapoints per SMILES column.
# Only the datapoints of the first column are created with the target values;
# the datapoints of the remaining columns are created from the SMILES alone.
all_data = [
    [data.MoleculeDatapoint.from_smi(row[0], target) for row, target in zip(smiss, ys)],
    *(
        [data.MoleculeDatapoint.from_smi(row[col]) for row in smiss]
        for col in range(1, len(smiles_columns))
    ),
]


# Split data

In [None]:
# User-adjustable splitting inputs: change these to control the data split.

# Column of the key molecule used for splitting.
split_key_molecule_column = 'smiles'

# Type of split.
split = 'random'

# Sizes of the train, validation, and test sets.
sizes = (0.8, 0.1, 0.1)

In [None]:
# Show the names of the split types exposed by `data.SplitType`.
[*data.SplitType.keys()]

['CV_NO_VAL',
 'CV',
 'SCAFFOLD_BALANCED',
 'RANDOM_WITH_REPEATED_SMILES',
 'RANDOM',
 'KENNARD_STONE',
 'KMEANS']

In [None]:
# Split the datapoints of the key molecule and keep the resulting indices
# (`train_val_test_indices`) so the same split can be applied to other molecules.
split_key_molecule_index = smiles_columns.index(split_key_molecule_column)
train_data, val_data, test_data, train_val_test_indices = data.split_data(
    all_data[split_key_molecule_index],
    split=split,
    sizes=sizes,
)

# Wrap each subset of the key molecule in its own dataset.
train_dataset, val_dataset, test_dataset = (
    data.MoleculeDataset(subset) for subset in (train_data, val_data, test_data)
)

In [None]:
# Apply the same split to the other molecules.
#
# The per-split dataset lists are built in component order, i.e. the order of
# `smiles_columns` / `all_data`, rather than "key molecule first". The targets
# were attached to the datapoints of the first SMILES column, so keeping that
# component first matters whenever the key molecule is not the first column.
#
# The loop uses its own local names instead of overwriting `train_dataset`,
# `val_dataset`, `test_dataset` (and `train_data`, ...) from the splitting
# cell, so this cell gives the same result when it is re-run.
train_datasets, val_datasets, test_datasets = [], [], []
for component_index, component_data in enumerate(all_data):
    if component_index == split_key_molecule_index:
        # The key molecule was already split; reuse its datasets.
        component_datasets = (train_dataset, val_dataset, test_dataset)
    else:
        # Reuse the indices of the key-molecule split for this component.
        train_part, val_part, test_part, _ = data.split_data(
            component_data, train_val_test_indices=train_val_test_indices
        )
        component_datasets = (
            data.MoleculeDataset(train_part),
            data.MoleculeDataset(val_part),
            data.MoleculeDataset(test_part),
        )

    train_datasets.append(component_datasets[0])
    val_datasets.append(component_datasets[1])
    test_datasets.append(component_datasets[2])

In [None]:
# Combine the per-molecule datasets of each split into one multicomponent dataset.
train_mcdset = data.MulticomponentDataset(train_datasets)
val_mcdset = data.MulticomponentDataset(val_datasets)
test_mcdset = data.MulticomponentDataset(test_datasets)

# Normalize the training targets first and keep the returned scaler, then pass
# that same scaler when normalizing the validation and test targets.
scaler = train_mcdset.normalize_targets()
val_mcdset.normalize_targets(scaler)
test_mcdset.normalize_targets(scaler)


In [None]:
# One data loader per split. Shuffling is explicitly disabled for the
# validation and test loaders; the training loader uses the loader's default.
train_loader = data.MolGraphDataLoader(
    train_mcdset,
)
val_loader = data.MolGraphDataLoader(
    val_mcdset,
    shuffle=False,
)
test_loader = data.MolGraphDataLoader(
    test_mcdset,
    shuffle=False,
)

In [None]:
# Message passing: one bond message block per SMILES column.
mcmp = modules.MulticomponentMessagePassing(
    blocks=[modules.BondMessageBlock() for _ in smiles_columns],
    n_components=len(smiles_columns),
)

In [None]:
# Aggregation module passed to the model below.
agg = modules.MeanAggregation()

In [None]:
# Regression head. `loc` and `scale` receive the mean and scale held by the
# scaler that was returned when the training targets were normalized.
ffn = modules.RegressionFFN(loc=scaler.mean_, scale=scaler.scale_)

In [None]:
# Metrics to report. Only the first metric is used for training and early stopping.
metric_list = [
    metrics.RMSEMetric(),
    metrics.MAEMetric(),
]

In [None]:
# Assemble the multicomponent model from the message passing, aggregation and
# FFN modules, and display it as the cell output.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'MulticomponentMessagePassing' object has no attribute 'hparams'"
# - confirm this call against the installed chemprop version.
mcmpnn = multi.MulticomponentMPNN(mcmp, agg, ffn, metrics=metric_list)

mcmpnn

AttributeError: 'MulticomponentMessagePassing' object has no attribute 'hparams'