In [1]:
import glob
import os
import sys

import h5py
import numpy as np
import pandas as pd
import yaml
from rdkit import Chem

### Load hdf5

In [2]:
# Collect the dataset part files from the working directory in name order.
files = sorted(glob.glob("Open*.hdf5"))

In [3]:
files

['OpenFF-PEPCONF-OptimizationDataset-part1.hdf5',
 'OpenFF-PEPCONF-OptimizationDataset-part2.hdf5',
 'OpenFF-PEPCONF-OptimizationDataset-part3.hdf5',
 'OpenFF-PEPCONF-OptimizationDataset-part4.hdf5']

### Check data format

In [4]:
# Open the first part file read-only to inspect its layout. The handle is kept
# open on purpose: the following inspection cells index into `data` lazily.
# The explicit "r" guards against older h5py versions whose default mode was
# append, which could modify the source file.
data = h5py.File(files[0], "r")
# Top-level keys of the file, one per stored group.
groups = list(data.keys())

In [5]:
groups

['CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(=O)O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CO)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CS)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCC(=O)O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CCC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]cn1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N1CCC[C@H]1C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1c[nH]cn1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccc(O)cc1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(N)=O',
 'CC(=O)N1

In [6]:
data[groups[0]].keys()

<KeysViewHDF5 ['atomic_numbers', 'conformations', 'smiles', 'subset', 'total_energy', 'total_gradient']>

In [7]:
data[groups[0]]['atomic_numbers']

<HDF5 dataset "atomic_numbers": shape (37,), type "<i2">

In [8]:
data[groups[0]]['conformations']

<HDF5 dataset "conformations": shape (86, 37, 3), type "<f4">

In [9]:
data[groups[0]]['smiles']

<HDF5 dataset "smiles": shape (1,), type "|O">

In [10]:
data[groups[0]]['subset']

<HDF5 dataset "subset": shape (1,), type "|O">

In [11]:
data[groups[0]]['total_energy']

<HDF5 dataset "total_energy": shape (86,), type "<f8">

In [12]:
data[groups[0]]['total_gradient']

<HDF5 dataset "total_gradient": shape (86, 37, 3), type "<f4">

### Grab unique group names and corresponding hdf5 file

In [13]:
# Build one [group, filename] record for every group found in each part file.
data = []
for file in files:
    # Read-only context manager: the handle is released even if iterating the
    # keys raises, instead of relying on an explicit close() being reached.
    with h5py.File(file, "r") as f:
        data.extend([group, file] for group in f.keys())

In [14]:
len(data)

571

In [15]:
# Tabulate the (group, source file) records; pandas is imported in the
# notebook's top import cell together with the other dependencies.
df = pd.DataFrame(data, columns=["GROUP", "FILENAME"])

In [16]:
df

Unnamed: 0,GROUP,FILENAME
0,CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5
1,CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@...,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5
2,CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(N)=O,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5
3,CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(N)=O)C(=O)N[...,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5
4,CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(N)=O)C(N)=O,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5
...,...,...
566,O=C1NCC(=O)N2CCC[C@H]2C(=O)NCC(=O)N2CCC[C@@H]12,OpenFF-PEPCONF-OptimizationDataset-part4.hdf5
567,O=C1NCC(=O)N2CCC[C@H]2C(=O)NCC(=O)N2CCC[C@H]2C...,OpenFF-PEPCONF-OptimizationDataset-part4.hdf5
568,O=C1[C@@H]2CCCN2C(=O)[C@H]2CCCN2C(=O)[C@H]2CCC...,OpenFF-PEPCONF-OptimizationDataset-part4.hdf5
569,O=C1[C@@H]2CCCN2C(=O)[C@H]2CCCN2C(=O)[C@H]2CCC...,OpenFF-PEPCONF-OptimizationDataset-part4.hdf5


In [17]:
# Sanity check: the number of distinct group names should be 568.
df['GROUP'].nunique()

568

In [18]:
# Distinct group names, in order of first appearance, as a plain Python list.
groups = df['GROUP'].drop_duplicates().tolist()
len(groups)

568

In [19]:
df[df['GROUP'] == groups[0]]

Unnamed: 0,GROUP,FILENAME
0,CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O,OpenFF-PEPCONF-OptimizationDataset-part1.hdf5


In [20]:
# Select the column with a single .loc call (no chained indexing) and take the
# first match by position. The filtered Series keeps the original row labels,
# so a label lookup with [0] only succeeds when the match happens to be row 0.
df.loc[df['GROUP'] == groups[0], 'FILENAME'].iloc[0]

'OpenFF-PEPCONF-OptimizationDataset-part1.hdf5'

Check whether the same molecule (group) is stored in more than one HDF5 file.

In [21]:
# Report every group that does not map to exactly one part file.
for group in groups:
    n_files = int((df['GROUP'] == group).sum())
    if n_files != 1:
        print(group, n_files)

CSCC[C@H](NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(C)=O)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O 2
CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC(C)=O)C(N)=O 2
CC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O 2


### merge hdf5 files

In [22]:
# Create the merged output file. Mode 'w' creates the file and truncates any
# existing file with the same name.
outputfile = h5py.File('PEPCONF-OPTIMIZATION-DATASET-OPENFF-DEFAULT.hdf5', 'w')

In [23]:
# Merge each group's records from every part file into one group of the
# output file, concatenating conformations, energies and gradients in the
# order in which the part files are listed in `df`.
for name in groups:
    # A dedicated name keeps the notebook-level `files` list of part files
    # from being overwritten by this loop.
    source_files = df.loc[df['GROUP'] == name, 'FILENAME'].tolist()

    conformations = []
    energies = []
    gradients = []
    for source_file in source_files:
        # Read everything into memory inside the context manager so each part
        # file is closed right away instead of being left open.
        with h5py.File(source_file, 'r') as f:
            entry = f[name]
            # Metadata is taken from the last part file containing the group.
            subset = entry['subset'][0]
            smiles = entry['smiles'][0]
            atomic_numbers = entry['atomic_numbers'][:]
            conformations.append(entry['conformations'][:])
            energies.append(entry['total_energy'][:])
            gradients.append(entry['total_gradient'][:])

    group = outputfile.create_group(name)
    group.create_dataset('subset', data=[subset.decode('utf-8')], dtype=h5py.string_dtype())
    group.create_dataset('smiles', data=[smiles.decode('utf-8')], dtype=h5py.string_dtype())
    group.create_dataset('atomic_numbers', data=atomic_numbers, dtype=np.int16)

    # Stack the per-file blocks along the first axis with a single array
    # operation rather than copying one row at a time.
    ds = group.create_dataset('conformations', data=np.concatenate(conformations), dtype=np.float32)
    ds.attrs['units'] = 'bohr'

    ds = group.create_dataset('total_energy', data=np.concatenate(energies), dtype=np.float64)
    # Unit name spelled correctly (was 'hartee').
    ds.attrs['units'] = 'hartree'

    ds = group.create_dataset('total_gradient', data=np.concatenate(gradients), dtype=np.float32)
    ds.attrs['units'] = 'hartree/bohr'

# Close the output so all data is flushed to disk before the file is reopened.
outputfile.close()

### Load new hdf5

In [24]:
# Reopen the merged file for verification. The explicit read-only mode makes
# sure the checks below cannot modify it, whatever the h5py default mode is.
f = h5py.File('PEPCONF-OPTIMIZATION-DATASET-OPENFF-DEFAULT.hdf5', 'r')

In [25]:
len(f.keys())

568

In [26]:
# Group names as stored in the merged file.
groups = list(f.keys())

In [27]:
groups

['CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(=O)O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1c[nH]cn1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccc(O)cc1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N1CCC[C@H]1C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1c[nH]cn1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(N)=O)C(N)=O',
 'CC(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(=O)O)C(=O)N1CCC[C@H]1C(N)=O',
 'CC(

In [28]:
#group = "CSCC[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(C)=O)C(N)=O"
#data = f[group]

In [29]:
f[groups[0]].keys()

<KeysViewHDF5 ['atomic_numbers', 'conformations', 'smiles', 'subset', 'total_energy', 'total_gradient']>

In [30]:
f[groups[0]]['atomic_numbers']

<HDF5 dataset "atomic_numbers": shape (51,), type "<i2">

In [31]:
f[groups[0]]['conformations']

<HDF5 dataset "conformations": shape (242, 51, 3), type "<f4">

In [32]:
f[groups[0]]['smiles']

<HDF5 dataset "smiles": shape (1,), type "|O">

In [33]:
f[groups[0]]['subset']

<HDF5 dataset "subset": shape (1,), type "|O">

In [34]:
f[groups[0]]['total_energy']

<HDF5 dataset "total_energy": shape (242,), type "<f8">

In [35]:
f[groups[0]]['total_gradient']

<HDF5 dataset "total_gradient": shape (242, 51, 3), type "<f4">

In [36]:
f[groups[0]]['total_energy'][0]

-1183.5471905810098

In [37]:
# Spot-check one group by name. This is the first group of part 1, whose
# conformations had shape (86, 37, 3) there, and the filename table lists it
# in a single part file, so the merged shape should be unchanged.
group = "CC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(N)=O"
data = f[group]
data['conformations']

<HDF5 dataset "conformations": shape (86, 37, 3), type "<f4">