# Prepare Input Data
We are going to train an MPNN to predict the B3LYP-level atomization energy.

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from moldesign.score.mpnn.data import make_tfrecord
from moldesign.utils.conversions import convert_string_to_dict
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import json



## Get the Data
The dataset is stored in a [GitHub repository](https://github.com/globus-labs/g4mp2-atomization-energy) from a previous project.

In [2]:
# Load the gzipped, line-delimited JSON dataset straight from the repository
data_url = 'https://github.com/globus-labs/g4mp2-atomization-energy/raw/master/data/output/g4mp2_data.json.gz'
data = pd.read_json(data_url, lines=True)
print(f'Downloaded {len(data)} training entries')

Downloaded 130258 training entries


## Save the Data as TF Records
We'll make training, validation, and test sets.

In [3]:
# Rows flagged `in_holdout` are reserved as the test set
test_set = data.query('in_holdout')
# Report these as test entries: they are held out of training and validation
print(f'Set aside {len(test_set)} test entries')

Set aside 13026 training entries


In [4]:
# Everything outside the holdout is divided 90/10 into training and validation
# subsets; the fixed seed keeps the split the same on every run
non_holdout = data.query('not in_holdout')
train_set, val_set = train_test_split(
    non_holdout,
    test_size=0.1,
    random_state=1,
)
print(f'Split off {len(train_set)} training and {len(val_set)} validation entries')

Split off 105508 training and 11724 validation entries


Write each subset (training, validation, and test) to its own TFRecord file.

In [None]:
# Write each subset to its own TFRecord file, named `{name}_data.proto`
for name, dataset in zip(['train', 'valid', 'test'], [train_set, val_set, test_set]):
    # Shuffle contents; the fixed seed makes the record order (and therefore the
    # written files) reproducible across runs, matching the seeded split above
    dataset = dataset.sample(frac=1., random_state=1)
    with tf.io.TFRecordWriter(f'{name}_data.proto') as writer:
        # `iterrows()` is a generator with no length, so pass `total` explicitly
        # to let tqdm display a percentage and time estimate
        for _, entry in tqdm(dataset.iterrows(), desc=name, total=len(dataset)):
            # Build the record from the entry's `smiles_0` column
            record = convert_string_to_dict(entry['smiles_0'])
            # Attach both target properties to the record
            for output in ['u0_atom', 'g4mp2_atom']:
                record[output] = entry[output]
            writer.write(make_tfrecord(record))

train: 937it [00:00, 2120.51it/s]