# Compute Representations
Generate QML representations for the training and test data

In [1]:
from jcesr_ml.utils import compute_atomization_energy
from jcesr_ml.benchmark import load_benchmark_data
from ase.io.xyz import read_xyz
from qml.data import Compound
from io import StringIO
import pickle as pkl
import pandas as pd
import numpy as np

Numbers to change

In [2]:
# Size argument handed to `generate_fchl_representation` for every molecule
# (presumably the largest number of atoms a molecule may have — TODO confirm against the qml docs)
max_size = 30  # Maximum size of molecules

Get the training and test data

In [3]:
# Load the benchmark split; later cells read the 'xyz', 'u0' and 'g4mp2_0k' columns of both frames
train_data, test_data = load_benchmark_data()

## Compute the Representations
We are going to use the [FCHL representation](https://doi.org/10.1063/1.5020710) to describe each molecule. With `qml`, the representation of each molecule must be pre-computed before it can be used with arbitrary molecules. Before computing the representation, we need to convert each XYZ string to a QML `Compound`. This representation is fairly large, so we don't store it.

In [4]:
def compute_rep(x, size=None):
    """Generate the FCHL representation of a molecule and return it.

    Args:
        x: Molecule object exposing ``generate_fchl_representation(size)`` and a
            ``representation`` attribute that is read after that call
            (in this notebook, a ``qml`` ``Compound``).
        size (int): Size argument forwarded to ``generate_fchl_representation``.
            When omitted, the notebook-level ``max_size`` setting is used, so
            existing one-argument calls behave exactly as before.
    Returns:
        The value of ``x.representation`` after the representation was generated.
    """
    if size is None:
        # Fall back to the notebook-wide setting, looked up at call time
        size = max_size
    x.generate_fchl_representation(size)
    return x.representation

In [5]:
%%time
# Parse every XYZ string into a qml Compound, then keep only its FCHL representation
train_data['rep'] = [compute_rep(Compound(StringIO(xyz))) for xyz in train_data['xyz']]
test_data['rep'] = [compute_rep(Compound(StringIO(xyz))) for xyz in test_data['xyz']]

CPU times: user 2min 43s, sys: 12.4 s, total: 2min 56s
Wall time: 2min 58s


## Compute Atomization Energy
The original paper compares results on atomization energies, so let's use the atomization energy for our purposes as well.

In [6]:
# Atomization energy from the 'u0' column, computed row by row with the 'b3lyp' setting
train_data['u0_atomization'] = train_data.apply(
    lambda row: compute_atomization_energy(
        next(read_xyz(StringIO(row['xyz']))),  # first structure read from the XYZ string
        row['u0'],
        'b3lyp',
    ),
    axis='columns',
)

In [7]:
# Atomization energy from the 'u0' column, computed row by row with the 'b3lyp' setting
test_data['u0_atomization'] = test_data.apply(
    lambda row: compute_atomization_energy(
        next(read_xyz(StringIO(row['xyz']))),  # first structure read from the XYZ string
        row['u0'],
        'b3lyp',
    ),
    axis='columns',
)

In [8]:
# Atomization energy from the 'g4mp2_0k' column, computed row by row with the 'g4mp2' setting
train_data['g4mp2_atomization'] = train_data.apply(
    lambda row: compute_atomization_energy(
        next(read_xyz(StringIO(row['xyz']))),  # first structure read from the XYZ string
        row['g4mp2_0k'],
        'g4mp2',
    ),
    axis='columns',
)

In [9]:
# Atomization energy from the 'g4mp2_0k' column, computed row by row with the 'g4mp2' setting
test_data['g4mp2_atomization'] = test_data.apply(
    lambda row: compute_atomization_energy(
        next(read_xyz(StringIO(row['xyz']))),  # first structure read from the XYZ string
        row['g4mp2_0k'],
        'g4mp2',
    ),
    axis='columns',
)

In [10]:
# Difference between the two atomization energies: g4mp2 minus u0
train_data['delta'] = train_data['g4mp2_atomization'].sub(train_data['u0_atomization'])

In [11]:
# Difference between the two atomization energies: g4mp2 minus u0
test_data['delta'] = test_data['g4mp2_atomization'].sub(test_data['u0_atomization'])

## Save Data to Disk
Save the data as a compressed pickle file

In [12]:
# Writes the frame with every column assigned in this notebook, including 'rep';
# pandas infers the compression from the path by default, so the '.gz' suffix selects gzip
train_data.to_pickle('train_data.pkl.gz')

In [13]:
# Writes the frame with every column assigned in this notebook, including 'rep';
# pandas infers the compression from the path by default, so the '.gz' suffix selects gzip
test_data.to_pickle('test_data.pkl.gz')