# Prepare Input Data
We are going to train an MPNN to predict the oxidation potential of a molecule (the `oxidation_potential.smb-vacuum` property stored in our database).

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from moldesign.score.mpnn.data import convert_nx_to_dict, make_type_lookup_tables, make_tfrecord
from moldesign.utils.conversions import convert_smiles_to_nx
from moldesign.store.mongo import MoleculePropertyDB
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import json
import os



## Get the Data
The best copy is in our MongoDB instance

In [2]:
# Open a handle to the molecule property database. No arguments are passed,
# so the connection details come from the defaults of `from_connection_info`.
mongo = MoleculePropertyDB.from_connection_info()

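Load the training set: the SMILES string and other stored fields for each molecule, together with the `oxidation_potential.smb-vacuum` value used as the training output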

In [3]:
# Fields passed as the first argument of `get_training_set`
# NOTE(review): presumably the per-molecule inputs -- confirm against the method signature
input_fields = [
    'key',
    'identifier.smiles',
    'data.xtb.neutral.xyz',
    'data.small_basis.neutral.xyz',
    'oxidation_potential.xtb-vacuum',
]
# Fields passed as the second argument; this is the value written as the
# `output` of each record later in the notebook
output_fields = ['oxidation_potential.smb-vacuum']

# Query the database and hold the result as a DataFrame in a single step
data = pd.DataFrame(mongo.get_training_set(input_fields, output_fields))
print(f'Loaded {len(data)} molecules')

Loaded 2563 molecules


Convert the SMILES to a networkx object

In [4]:
%%time
data['nx'] = data['identifier.smiles'].apply(lambda x: convert_smiles_to_nx(x, add_hs=True))

CPU times: user 562 ms, sys: 12.7 ms, total: 574 ms
Wall time: 576 ms


## Save the Data as TF Records
We'll make training, validation, and test sets

In [5]:
# Hold out 10% of the molecules as the test set; the fixed seed keeps the
# split identical between runs
train_set, test_set = train_test_split(data, test_size=0.1, random_state=1, shuffle=True)
# These are the held-out *test* entries, not training entries
print(f'Set aside {len(test_set)} test entries')

Set aside 257 training entries


In [6]:
# Carve a validation set (10%) out of the remaining training entries.
# Note: `train_set` is both the input and an output of this call, so re-running
# this cell on its own splits the already-reduced training set again.
train_set, val_set = train_test_split(
    train_set,
    test_size=0.1,
    random_state=1,
    shuffle=True,
)
print(f'Split off {len(train_set)} training and {len(val_set)} validation entries')

Split off 2075 training and 231 validation entries


Get all of the atom and bond types observed in the dataset

In [7]:
%%time
# Build the lookup tables from the graphs of every molecule in `data`,
# i.e. all splits, not only the training set.
# NOTE(review): presumably these map atom/bond types to integer indices -- confirm
atom_types, bond_types = make_type_lookup_tables(data['nx'])

CPU times: user 41.5 ms, sys: 0 ns, total: 41.5 ms
Wall time: 41.3 ms


In [8]:
# Persist both lookup tables as JSON files in the working directory
for filename, lookup in [('atom_types.json', atom_types), ('bond_types.json', bond_types)]:
    with open(filename, 'w') as fp:
        json.dump(lookup, fp)

Save each split as a CSV file and as a TFRecord file

In [9]:
# Make sure the output directory exists; exist_ok=True keeps re-runs from failing
os.makedirs('datasets', exist_ok=True)

In [10]:
# Write each split twice: a CSV of the tabular fields and a TFRecord file
# holding one record per molecule.
for name, dataset in zip(['train', 'valid', 'test'], [train_set, val_set, test_set]):
    # Shuffle contents. The seed is fixed (matching the splits above) so the
    # files written here are identical from one run of the notebook to the next.
    dataset = dataset.sample(frac=1., random_state=1)

    # The `nx` column holds Python objects, so it is left out of the CSV
    dataset.drop(columns=['nx']).to_csv(f'datasets/{name}.csv', index=False)

    with tf.io.TFRecordWriter(f'datasets/{name}_data.proto') as writer:
        # `iterrows()` has no length, so pass `total` to give tqdm a real progress bar
        for _, entry in tqdm(dataset.iterrows(), desc=name, total=len(dataset)):
            record = convert_nx_to_dict(entry['nx'], atom_types, bond_types)
            # Value stored as the record's `output` field
            record['output'] = entry['oxidation_potential.smb-vacuum']
            writer.write(make_tfrecord(record))

train: 2075it [00:00, 5452.84it/s]
valid: 231it [00:00, 5525.62it/s]
test: 257it [00:00, 5529.89it/s]
