# Parse Large Molecules
We also have the G4MP2 energies of large molecules. This script reads them in from the format I was sent, and puts them into a DataFrame.

In [1]:
from jcesr_ml.utils import compute_atomization_energy
from dlhub_sdk.models.datasets import TabularDataset
from pymatgen.io.xyz import XYZ
from ase.io.xyz import read_xyz
from pybel import readstring
from io import StringIO
from glob import glob
from tqdm import tqdm
import pandas as pd
import json
import os

In [2]:
# Path of the JSON file that the parsed dataset is written to later in the notebook
output_path = os.path.join('data', 'output', 'sugar_data.json')

## Find the Files
Load in the list of files from disk

In [3]:
# Gather every XYZ file in the input directory; sorting keeps the order stable between runs
xyz_pattern = os.path.join('data', 'input', 'GDB_Sugar_Data', '*.xyz')
files = sorted(glob(xyz_pattern))
print('Found {} files'.format(len(files)))

Found 169 files


Print out a file

In [4]:
# Read the first file line by line; `content` is reused further down as a sample input
with open(files[0]) as fp:
    content = list(fp)

# Show the raw text exactly as it appears on disk
print(''.join(content))

15
Lattice="1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0" Properties=species:S:1:pos:R:3:Z:I:1 pbc="F F F"
C       1.31302700      -0.01963100      -0.00001900        6 
O       1.51532300       1.17613700      -0.00001300        8 
O       2.33757700      -0.91535000      -0.00003900        8 
H       3.14699500      -0.38391600      -0.00004800        1 
C       0.01695900      -0.70499200      -0.00000000        6 
C      -1.17617700      -0.08344200       0.00002500        6 
C      -2.43710300      -0.92361700       0.00000700        6 
H      -3.33801500      -0.30690900       0.00044300        1 
H      -2.47489900      -1.57478400      -0.88000900        1 
H      -2.47449000      -1.57546800       0.87953000        1 
H       0.06364900      -1.78926000      -0.00000500        1 
C      -1.33094400       1.41699900       0.00004400        6 
H      -0.83845600       1.86318700       0.86873000        1 
H      -0.83835800       1.86322800      -0.86856400        1 
H      -2.38419900  

As in the original dataset, we have G4MP2 data at the bottom of an XYZ file

## Load in B3LYP Data
This was computed separately, and is available in a separate text file

In [5]:
# Read the whitespace-delimited table of B3LYP energies.
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which newer pandas releases deprecate
b3lyp_data = pd.read_csv(os.path.join('data', 'input', 'GDB_Sugar_Data', 'Energy_GDB_data.txt'), sep=r'\s+')

In [6]:
# Preview the first rows to check the columns that were parsed
b3lyp_data.head()

Unnamed: 0,filename,zpe,E0,Ee
0,furan-hydroxymethylfurfuraldehyde.in.out,0.112218,-457.799384,-457.911602
1,phenol-2ethylphenol.in.out,0.160611,-385.967323,-386.127934
2,oxygenates-maltol.in.out,0.11302,-457.825407,-457.938427
3,aromatics-orthoxylene.in.out,0.155303,-310.751167,-310.90647
4,furan-dimethylfuran.in.out,0.124978,-308.565897,-308.690875


Make a column with the data name

In [7]:
# The molecule name is the part of the filename before the first "."
b3lyp_data['name'] = b3lyp_data['filename'].map(lambda filename: filename.partition(".")[0])

Rename the columns to match QM9 nomenclature, and drop columns not found in QM9

In [8]:
# 'E0' becomes 'u0'; the table is modified in place
b3lyp_data.rename(columns={'E0': 'u0'}, inplace=True)

In [9]:
# Remove the columns that have no QM9 counterpart.
# The axis is selected with the `columns=` keyword because pandas >= 2.0 rejects a positional axis argument
b3lyp_data.drop(columns=['filename', 'Ee'], inplace=True)

The table now contains the molecule name and B3LYP energies. We will turn it into a dict of "molecule name" -> properties for easier use

In [10]:
# Index by molecule name and convert to {name: {column: value}}.
# NOTE: this rebinds `b3lyp_data` from a DataFrame to a dict, so the cell cannot be re-run
# without first re-running the cells above that build the DataFrame.
b3lyp_data = b3lyp_data.set_index('name').to_dict('index')

We now have an easy to use dict of B3LYP properties

## Parse Data
Get the key fields and render the data as a dataframe

In [11]:
def extract_g4mp2(lines):
    """
    Extract the G4MP2 outputs and solvation energies

    The values are read from the last nine lines of the file: the first five of
    those lines hold the G4MP2 outputs, the sixth is skipped, and the final three
    hold the solvation energies. The value on each line is its last
    whitespace-delimited token.

    Args:
        lines ([string]): Lines from the XYZ file
    Returns:
        (dict): Properties related to the G4MP2 outputs and solubility calculations
    Raises:
        ValueError: If fewer than nine lines are supplied
    """

    # The property block is located by counting back from the end of the file.
    # With fewer lines the slices below would shrink and property names would be
    # silently paired with the wrong lines, so fail loudly instead.
    n_footer = 9
    if len(lines) < n_footer:
        raise ValueError('Expected at least {} lines, found {}'.format(n_footer, len(lines)))

    # Generate list of JCESR properties
    properties = ['g4mp2_0k', 'g4mp2_zpe', 'g4mp2_energy', 'g4mp2_enthalpy', 'g4mp2_free',
                  'sol_water', 'sol_acn', 'sol_dmso']

    # Keep footer lines 1-5 and 7-9; the sixth line from the top of the footer is not a stored property
    footer = lines[-n_footer:]
    value_lines = footer[:5] + footer[6:]
    return dict((name, float(line.split()[-1])) for name, line in
                zip(properties, value_lines))
# extract_g4mp2(content) 

In [12]:
def get_clean_xyz_file(lines):
    """Rewrite one of our XYZ files into a form that parses cleanly

    The files in this dataset are not read easily by ASE or pymatgen, so the
    content is passed through pymatgen's single-frame parser and written back out.

    Args:
        lines ([string]): Entire file
    Returns:
        (string) File in a cleaner XYZ format
    """

    raw_text = "".join(lines)

    # Use pymatgen's frame-level parser directly to force it to read the file
    molecule = XYZ._from_frame_string(raw_text)

    # Render the parsed molecule back as XYZ text
    return str(XYZ(molecule))
xyz = get_clean_xyz_file(content)

In [13]:
def get_counts(xyz):
    """Measure the size of a molecule from its XYZ-format structure

    Args:
        xyz (string): XYZ-format file of the atomic structure
    Returns:
        (dict) Different measures of molecular size:
            n_atom (int): Number of atoms
            n_heavy_atoms (int): Number of heavy (i.e., non-Hydrogen) atoms
            n_electrons (int): Number of electrons, taken as the sum of the atomic numbers
    """

    # Only the first frame of the XYZ text is read
    frames = read_xyz(StringIO(xyz))
    atoms = next(frames)

    atomic_numbers = atoms.get_atomic_numbers()
    is_heavy = atomic_numbers > 1  # Anything heavier than hydrogen
    return dict(
        n_atom=len(atomic_numbers),
        n_heavy_atoms=is_heavy.sum(),
        n_electrons=atomic_numbers.sum(),
    )

In [14]:
def get_atomization_energies(xyz, u0, g4mp2_0k):
    """Convert the total energies of a molecule into atomization energies

    Args:
        xyz (string): XYZ-format file of the atomic structure
        u0 (float): B3LYP 0K total energy
        g4mp2_0k (float): G4MP2 total energy
    Returns:
        (dict) With computed atomization energies:
            u0_atom (float): B3LYP atomization energy (Ha)
            g4mp2_atom (float): G4MP2 atomization energy (Ha)
    """

    # Only the first frame of the XYZ text is read
    atoms = next(read_xyz(StringIO(xyz)))

    # One entry per level of theory, each computed from the same structure
    energies = {}
    energies['u0_atom'] = compute_atomization_energy(atoms, u0, 'b3lyp')
    energies['g4mp2_atom'] = compute_atomization_energy(atoms, g4mp2_0k, 'g4mp2')
    return energies

In [15]:
def parse_files(file_list):
    """Build a dataframe with one row per XYZ file

    Each row combines the G4MP2 values stored in the file, the cleaned structure,
    its SMILES string, size metrics, the matching entry of the module-level
    `b3lyp_data` dict, and the derived atomization energies.

    Args:
        file_list ([str]): Path to files to be extracted
    Returns:
        (DataFrame) Dataset
    """
    records = []
    for path in tqdm(file_list):
        with open(path) as fp:
            lines = fp.readlines()

        # Numerical data stored at the end of the file
        record = extract_g4mp2(lines)

        # Molecule name (file name up to the first ".") and the cleaned structure
        record['name'] = os.path.basename(path).split(".")[0]
        record['xyz'] = get_clean_xyz_file(lines)

        # SMILES string generated from the structure
        # NOTE(review): the writer output is stored as-is; confirm whether it carries a
        # trailing title/newline that downstream code needs to strip
        record['smiles'] = readstring('xyz', record['xyz']).write('smiles')

        # Metrics of the molecular size
        size_metrics = get_counts(record['xyz'])
        record.update(size_metrics)

        # B3LYP properties, looked up by molecule name (KeyError if the name is absent)
        record.update(b3lyp_data[record['name']])

        # Atomization energies at both levels of theory
        atomization = get_atomization_energies(record['xyz'],
                                               record['u0'],
                                               record['g4mp2_0k'])
        record.update(atomization)

        records.append(record)

    return pd.DataFrame(records)

In [16]:
# Parse every file found above into a single DataFrame
dataset = parse_files(files)

100%|██████████| 169/169 [00:00<00:00, 406.77it/s]


Sort it into a reproducible order

In [17]:
# Order rows by molecule name (modifies `dataset` in place)
dataset.sort_values('name', inplace=True)

Keep only the molecules with more than 9 heavy atoms

In [18]:
# Keep rows with 10 or more heavy atoms; the filter is applied in place, so the smaller molecules are gone from `dataset` afterwards
dataset.query('n_heavy_atoms > 9', inplace=True)
print('Found {} molecules with >9 heavy atoms'.format(len(dataset)))

Found 67 molecules with >9 heavy atoms


## Remove an Outlier
One of the molecules has a very low energy, which we believe is the result of a faulty calculation

In [19]:
# Drop the single molecule identified as an outlier, selected by its name (in place)
dataset.query('name != "syringol-4-propylsyringol"', inplace=True)

## Mark a Test Set
Just in case we want to do any kind of hold-out test later

In [20]:
# Start with every entry outside the hold-out set
dataset['in_holdout'] = False

In [21]:
# Flag a random 10% of the rows; the fixed random_state makes the selection repeatable
dataset.loc[dataset.sample(frac=0.1, random_state=1).index, 'in_holdout'] = True

## Save the Data with a Description
Use the DLHub SDK to describe this data

In [22]:
# Write one JSON record per line.
# `orient` is passed by keyword because newer pandas releases deprecate positional arguments after the path
dataset.to_json(output_path, orient='records', lines=True)

Make the description

In [23]:
# Build the dataset description from the file just written; read_kwargs tells the reader the file is line-delimited JSON
metadata = TabularDataset.create_model(output_path, format='json', read_kwargs={'lines': True})

In [24]:
# The title reports how many molecules remain after the filtering steps above
metadata.set_title('Solubilities and G4MP2 Energies of {} Sugar Molecules'.format(len(dataset)))
# Short name for the dataset; the description file written at the end of the notebook is named after metadata['dlhub']['name']
# NOTE(review): presumably set_name is what populates that entry — confirm against the DLHub SDK
metadata.set_name('G4MP2-heavy')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

TBD: Get other authors from Rajeev

In [25]:
# Every author shares the same single affiliation, so build one affiliation entry per author
authors = ['Assary, Rajeev', 'Narayanan, Badri', 'Cheng, Lei', 'Curtiss, Larry']
affiliations = [['Argonne National Laboratory']] * len(authors)
metadata.set_authors(authors, affiliations)

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [26]:
# Annotate every energy column in one pass; all of them are tagged with units='Ha'.
# NOTE(review): 'g4mp2_free' is described as "at 0K" while the other thermal quantities
# are at 298.15K — confirm the temperature with the data provider.
for name, desc in [('g4mp2_0k', 'G4MP2 Internal energy at 0K'), ('g4mp2_energy', 'G4MP2 Internal energy at 298.15K'),
                   ('g4mp2_enthalpy', 'G4MP2 Enthalpy at 298.15K'), ('g4mp2_free', 'G4MP2 Free energy at 0K'),
                   ('g4mp2_atom', 'G4MP2 atomization energy at 0K'), ('g4mp2_zpe', 'Zero-point energy of molecule'),
                   ('zpe', 'B3LYP Zero point vibrational energy'), ('u0', 'B3LYP Internal energy at 0K'),
                   ('u0_atom', 'B3LYP atomization energy at 0K')]:
    metadata.annotate_column(name, description=desc, units='Ha')

In [27]:
# Describe every solvent column; the solvent is the part of the column name after the last "_"
# NOTE(review): extract_g4mp2 refers to these values as solvation energies — confirm the wording
for sol in dataset.columns:
    if not sol.startswith('sol_'):
        continue
    solvent = sol.rsplit("_", 1)[-1]
    metadata.annotate_column(sol, 'Solubility in {}'.format(solvent))

In [28]:
# 'name' holds the input file's base name up to the first "."
metadata.annotate_column('name', 'Filename, which contains the name of the molecule')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [29]:
# 'in_holdout' is the boolean flag set on the randomly sampled 10% of rows
metadata.annotate_column('in_holdout', 'Whether the entry is in the test set')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [30]:
# Size metrics produced by get_counts
metadata.annotate_column('n_atom', 'Number of atoms')
metadata.annotate_column('n_electrons', 'Number of electrons')
metadata.annotate_column('n_heavy_atoms', 'Number of non-hydrogen atoms')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [31]:
# Structure columns: both are text, so the data type is given explicitly
metadata.annotate_column('smiles', 'SMILES string after relaxation', data_type='string')
metadata.annotate_column('xyz', 'XYZ coordinates after relaxation', data_type='XYZ file')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [32]:
# The two structure representations are the input columns of the dataset
metadata.mark_inputs(['smiles', 'xyz'])

<dlhub_sdk.models.datasets.TabularDataset at 0x7f41dba207b8>

In [33]:
# Guard: stop here if get_unannotated_columns() still reports any column
assert len(metadata.get_unannotated_columns()) == 0

In [34]:
# Save the description next to the data file, named after the dataset
description_name = '{}-description.json'.format(metadata['dlhub']['name'])
description_path = os.path.join(os.path.dirname(output_path), description_name)
with open(description_path, 'w') as fp:
    json.dump(metadata.to_dict(), fp, indent=2)