# Parse PDB Molecules
We also have the solvation energies of PDB molecules.

In [1]:
from jcesr_ml.utils import compute_atomization_energy
from dlhub_sdk.models.datasets import TabularDataset
from pymatgen.io.xyz import XYZ
from ase.io.xyz import read_xyz
from pybel import readstring
from io import StringIO
from glob import glob
from tqdm import tqdm
from tarfile import TarFile
import pandas as pd
import gzip
import json
import os

In [2]:
# Location of the parsed dataset; the description file is saved in the same directory
output_path = os.path.join('data', 'output', 'pdb_data.json')

## Find the Files
Load in the list of files from a TarFile

In [3]:
# Let tarfile open (and own) the gzip stream itself, so that closing `tar`
# also releases the underlying file handle
tar = TarFile.open(os.path.join('data', 'input', 'PDB_large_molecules_191.tar.gz'), mode='r:gz')

In [4]:
# Keep only the archive members whose name marks them as XYZ files
files = [member for member in tar.getmembers() if member.name.endswith('.xyz')]
print(f'Found {len(files)} XYZ files')

Found 191 XYZ files


In [5]:
# Preview the first file; the context manager closes the extracted member
# once it has been read
with tar.extractfile(files[0]) as fp:
    print(fp.read().decode())

35
Lattice="1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0" Properties=species:S:1:pos:R:3 pbc="F F F"
O      -6.51686000      -0.61602700      -0.00039300 
O      -5.51527900       1.39380300      -0.00016200 
C       0.96293300      -0.52199000       0.00006700 
C       2.22995000       0.34011200       0.00012200 
C      -0.33295700       0.29586300       0.00020900 
C       3.52637800      -0.47705100      -0.00008300 
C      -1.59981700      -0.56637900       0.00016800 
C       4.79338100       0.38463300       0.00001700 
C      -2.89051900       0.25851500       0.00026700 
C       6.09026600      -0.43186500      -0.00020100 
C      -4.14756000      -0.61082800       0.00025100 
C       7.35096100       0.43697800      -0.00012400 
C      -5.42877900       0.19228700      -0.00012100 
H       0.97446300      -1.18437600       0.87709600 
H       0.97439900      -1.18418200      -0.87711100 
H       2.21819200       1.00228000       0.87721300 
H       2.21808700       1.00254900      -0.

## Parse Data
Get the key fields and render the data as a dataframe

In [6]:
def extract_solvation_energy(lines):
    """
    Extract the solvation energies

    The energies are read from the last three lines of the file. The value
    on each line is its last whitespace-separated token.

    Args:
        lines ([string]): Lines from the XYZ file
    Returns:
        (dict): Properties related to solubility calculations, keyed by
            'sol_water', 'sol_acn' and 'sol_dmso'
    Raises:
        ValueError: If fewer lines are given than there are properties, or
            if a value cannot be converted to a float
    """

    # Generate list of JCESR properties
    properties = ['sol_water', 'sol_acn', 'sol_dmso']

    # zip() would silently pair the names with the wrong lines if the file
    #  were too short, so fail loudly instead
    if len(lines) < len(properties):
        raise ValueError('Expected at least {} lines, found {}'.format(
            len(properties), len(lines)))

    return {name: float(line.split()[-1])
            for name, line in zip(properties, lines[-len(properties):])}
# extract_g4mp2(content) 

In [7]:
def get_clean_xyz_file(lines):
    """Rewrite one of our XYZ files in a cleaner XYZ format

    The XYZ files in our dataset don't parse easily with ASE or pymatgen,
    so the file is read with pymatgen's frame parser and written back out.

    Args:
        lines ([string]): Entire file
    Returns:
        (string) File in a cleaner XYZ format
    """

    # Re-assemble the file and force pymatgen's XYZ parser to read it
    raw_text = "\n".join(lines)
    molecule = XYZ._from_frame_string(raw_text)

    # Render the parsed molecule back to XYZ-format text
    return str(XYZ(molecule))

In [8]:
def get_counts(xyz):
    """Measure the size of a molecule from its XYZ coordinates

    Args:
        xyz (string): XYZ-format file of the atomic structure
    Returns:
        (dict) Different measures of molecular size:
            n_atoms (int): Total number of atoms
            n_heavy_atoms (int): Number of heavy (i.e., non-Hydrogen) atoms
            n_electrons (int): Number of electrons in the system
    """

    # Only the first structure read from the text is used
    frames = read_xyz(StringIO(xyz))
    atoms = next(frames)

    atomic_numbers = atoms.get_atomic_numbers()

    # Heavy atoms are those with an atomic number above 1
    n_heavy = (atomic_numbers > 1).sum()

    # The electron count is taken as the sum of the atomic numbers
    n_electrons = atomic_numbers.sum()

    return {
        'n_atoms': len(atomic_numbers),
        'n_heavy_atoms': n_heavy,
        'n_electrons': n_electrons,
    }

In [9]:
def parse_files(file_list):
    """Build a dataframe with one row per XYZ file

    Each entry is read out of the module-level ``tar`` archive.

    Args:
        file_list: Archive members to be extracted. Each entry is passed to
            ``tar.extractfile`` and must expose a ``name`` attribute
    Returns:
        (DataFrame) Dataset
    """
    records = []
    for member in tqdm(file_list):
        # Read the whole member and split it into lines
        with tar.extractfile(member) as fp:
            text = fp.read().decode()
        lines = text.strip().split("\n")

        # Start the record from the numerical data
        record = extract_solvation_energy(lines)

        # Keep the file name (without its directory) and the cleaned XYZ file
        record['name'] = os.path.basename(member.name)
        record['xyz'] = get_clean_xyz_file(lines)

        # Compute the smiles from the cleaned XYZ
        # NOTE(review): the string returned by write('smiles') is stored as-is;
        #  confirm whether it carries a trailing title/newline that consumers
        #  would need to strip
        record['smiles'] = readstring('xyz', record['xyz']).write('smiles')

        # Add the metrics of the molecular size
        size_metrics = get_counts(record['xyz'])
        record.update(size_metrics)

        records.append(record)

    return pd.DataFrame(records)

In [10]:
# Parse every XYZ file found in the archive into a single dataframe
dataset = parse_files(files)

100%|██████████| 191/191 [00:00<00:00, 494.00it/s]


In [11]:
# Label every row with the source of the data
dataset['source'] = 'pdb'

Sort it into a reproducible order

In [12]:
# Order the rows by file name so the row order is reproducible
dataset = dataset.sort_values('name')

## Mark a Test Set
Just in case we want to do any kind of hold-out test later

In [13]:
# Start with no entry marked as held out
dataset['in_holdout'] = False

In [14]:
# Draw a 10% sample with a fixed seed and flag those rows as held out
holdout_index = dataset.sample(frac=0.1, random_state=1).index
dataset.loc[holdout_index, 'in_holdout'] = True

## Save the Data with a Description
Use the DLHub SDK to describe this data

In [15]:
# Make sure the output directory exists before writing into it
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write one JSON record per line. `orient` is passed by keyword because recent
# pandas releases deprecate positional arguments to `to_json` after the path
dataset.to_json(output_path, orient='records', lines=True)

Make the description

In [16]:
# Create the dataset description from the saved file; `lines=True` is the same
# option that was used when writing the JSON
metadata = TabularDataset.create_model(output_path, format='json', read_kwargs={'lines': True})

In [17]:
# Put the number of entries in the title, then set the short name
metadata.set_title(f'Solvation Energy of {len(dataset)} Large Molecules')
metadata.set_name('pdb_solv')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

TBD: Get other authors from Rajeev

In [18]:
# Every author shares the same affiliation. Build one affiliation list per
# author so that the lists are independent objects and their number always
# matches the number of authors
authors = ['Dandu, Naveen', 'Assary, Rajeev', 'Narayanan, Badri', 'Curtiss, Larry']
metadata.set_authors(authors, [['Argonne National Laboratory'] for _ in authors])

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [19]:
# The `sol_*` columns hold solvation energies, so describe them as such.
# The solvent is the part of the column name after the underscore
for sol in dataset.columns:
    if sol.startswith('sol_'):
        solvent = sol.split("_")[-1]
        metadata.annotate_column(sol, 'Solvation energy in {}'.format(solvent))

In [20]:
# Describe the columns that identify each entry
metadata.annotate_column('name', 'Filename, which contains the name of the molecule')
metadata.annotate_column('source', 'Source of the dataset')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [21]:
# Describe the hold-out flag
metadata.annotate_column('in_holdout', 'Whether the entry is in the test set')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [22]:
# Describe the molecular-size columns produced by get_counts
metadata.annotate_column('n_atoms', 'Number of atoms')
metadata.annotate_column('n_electrons', 'Number of electrons')
metadata.annotate_column('n_heavy_atoms', 'Number of non-hydrogen atoms')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [23]:
# Describe the two structure columns and state their data types explicitly
metadata.annotate_column('smiles', 'SMILES string after relaxation', data_type='string')
metadata.annotate_column('xyz', 'XYZ coordinates after relaxation', data_type='XYZ file')

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [24]:
# Mark the structure columns as the inputs of the dataset
metadata.mark_inputs(['smiles', 'xyz'])

<dlhub_sdk.models.datasets.TabularDataset at 0x7f3fba115860>

In [25]:
# Every column must have a description before the metadata is saved;
# name the offending columns if any were missed
unannotated = metadata.get_unannotated_columns()
assert len(unannotated) == 0, f'Columns without a description: {unannotated}'

In [26]:
# Save the description beside the data file, in a file named after the dataset
description_name = f"{metadata['dlhub']['name']}-description.json"
description_path = os.path.join(os.path.dirname(output_path), description_name)
with open(description_path, 'w') as fp:
    json.dump(metadata.to_dict(), fp, indent=2)