In [1]:
import h5py
import numpy as np
from pprint import pprint
#f=h5py.File('aa.h5')

# example data by Lefteris (from the presentation)


atomState1={
    'Identity':{'Element':'C','AtomicNumber':6,'AtomicMass':12.0107},
    'Properties':{
        'Electrical':None,
        'Physical':{
            'PartialChargeNeutral':8.9765e-4,
            'PartialChargeAnion':-0.1256723,
            'PartialChargeCation':4.567e-3,
            'PolarizabilityNeutral':1.2345,
            'PolarizabilityAnion':2.7346,
            'PolarizabilityCation':1.7345
        },
        'Chemical':None,
    },
    'Topology':{
        'Parent':5, # molecule ID
        'Type':'ca',
        'Name':'C25',
        'Position':np.array([1.339729,115.4507,0.0]),
        'Velocity':np.array([-3.04164,5.19528283,-28.147476]),
        'Structure': [2,7,12,123] # indices of bonded atoms
    },
    'Implementation':None,
    'Part':None
}

molState1={
    'Identity':{'ChemicalName':'PC70BM','MolecularWeight':18.015},
    'Properties':{
        'Electrical':{'HOMO':-3.345,'LUMO':-2.456,'SiteEnergy':-4.765},
        'Physical':{
            'Charge':-1.0,
            'PolarizabilityNeutral':np.array([[0.05226585,-0.01557579,-0.05456737],[-.01733,0.00585766,0.02061123],[-0.0527127,0.01781754,0.06269415]]),
            'PolarizabilityAnion':np.array([[0.0344112,0.01183272,0.00055905],[0.00407375,-.0018961,-.01987108],[.03769111,-.01100703,.01663766]]),
            'PolarizabilityCation':np.array([[2.27643325e-2,-9.10359560e-3,-2.34446622e-5],[-9.5410557e-4,2.37367284e-4,1.95002721e-2],[-2.68511401e-2,6.67968934e-3,3.1221432e-3]]),
            'ReorganizationEnergyInternalNeutral':0.123,
            'ReorganizationEnergyInternalAnion':0.145,
            'ReorganizationEnergyInternalCation':0.164
        },
        'Chemical':None,
    },
    'Topology':{
        'Parent':23,
        'CenterOfMass':np.array([2.3397,5.450,-7.345]),
        'SymmetryAxis':np.array([.24961954,.30242738,.91990639]),
        'StructureNeighbors':[2,45,67,89],
        'TransferNeighbors':[4,45,23,32]
    },
    'Implementation': {'ForceFieldType':'GAFF'},
    'Part':[1,2,3,4,5]
}

grainState1={
    'Identity':{'Material':'P3HT'},
    'Properties':{
        'Electrical':{'FreeElectrons':0,'FreeHoles':2},
        'Physical':{'ReorganizationEnergyExternal':.057},
        'Chemical':None
    },
    'Topology':{
        'Parent':1,
        'CellSize': np.array([[115.59445898,2.33972946,31.29316306],[0.,115.45072951,37.91333443],[0.,0.,115.3226236]])
    },
    'Implementation': {'BoundaryCondition': 'PBC'},
    'Part': list(range(1,100001))
}



In [2]:
# hand-crafted data description

atomJson={
    'identity':{
        'element':{'dtype':'a2'},
        'atomicNumber':{'dtype':'l'},
        'atomicMass':{'dtype':'d','unit':'Dalton'},
    },
    'properties':{
        'physical':{
            'partialCharge':{
                'neutral': {'dtype':'d','unit':'e'},
                'anion': {'dtype':'d','unit':'e'},
                'cation': {'dtype':'d','unit':'e'},
            },
            'polarizability':{
                'neutral': {'dtype':'d','unit':'A^2 s^4 kg^-1'},
                'anion': {'dtype':'d','unit':'A^2 s^4 kg^-1'},
                'cation': {'dtype':'d','unit':'A^2 s^4 kg^-1'},
            }
        },
        'topology':{
            'parent':{'dtype':'l'},
            'type':{'dtype':'a100'},
            'name':{'dtype':'a100'},
            'position':{'dtype':'d','shape':[3],'unit':'AA'},
            'velocity':{'dtype':'d','shape':[3],'unit':'AA'},
            'structure':{'dtype':'l','dynamic':True},
        }
    }
}


In [33]:
import astropy.table
import astropy.units
# this will introduce 'e' but disables Daltons... :/
# astropy.units.cds.enable() # recognize 'e' as electron charge
# so define 'e' manually here
astropy.units.add_enabled_units([astropy.units.def_unit('e',astropy.constants.si.e)])

def mkAstropyQTable(name,desc,prefix=''):
    if 'dtype' in desc:
        unit=astropy.units.Unit(desc['unit']) if 'unit' in desc else astropy.units.dimensionless_unscaled
        # fixed-size array
        if 'shape' in desc: dtype=np.dtype(desc['dtype'],desc['shape'])
        # scalar or dynamic array
        else: dtype=np.dtype(desc['dtype'])
        # dynamic array
        if 'dynamic' in desc: return astropy.table.NdarrayMixin([1],dtype=dtype)
        if dtype.type==np.bytes_: return astropy.units.Quantity(value=[-1],unit=astropy.units.dimensionless_unscaled)
            #return astropy.table.Qtables.StringCol(itemsize=dtype.itemsize)
        #print(name,dtype,unit)
        return astropy.units.Quantity(value=[0],dtype=dtype,unit=unit)
    #print(name)
    return astropy.table.QTable(dict([(k,mkAstropyQTable(k,v)) for k,v in desc.items()]))
    
    # return type('%s_cols'%name,(tables.IsDescription,),dict([(k,mkNestedCols(k,v,(prefix+'_' if prefix else '')+k)) for k,v in desc.items()]))
a0t=mkAstropyQTable('Atom0',atomJson)
a0t['properties']['physical']['polarizability']['neutral'].pprint(show_unit=True)
# a0t['identity']['atomicMass']


properties
          
----------
       0.0


In [34]:
# pyTables, convert JSON to nested DType

import tables
def mkNestedCols(name,desc,prefix=''):
    if 'dtype' in desc:
        if 'shape' in desc: dtype=np.dtype(desc['dtype'],desc['shape'])
        else: dtype=np.dtype(desc['dtype'])
        if dtype.type==np.bytes_: return tables.StringCol(itemsize=dtype.itemsize)
        return tables.Col.from_dtype(dtype)
    return type('%s_cols'%name,(tables.IsDescription,),dict([(k,mkNestedCols(k,v,(prefix+'_' if prefix else '')+k)) for k,v in desc.items()]))

Atom0_cols=mkNestedCols('Atom0',atomJson)

N=100

tf=tables.open_file('test.h5',mode='w')
atomsGroup=tf.create_group('/','atoms')
atomTable=tf.create_table(atomsGroup,'atoms',Atom0_cols,'Atoms example')
atomStructure=tf.create_vlarray('/atoms','structure',atom=tables.Int64Atom(shape=()),title='atom structure',expectedrows=N)
arow=atomTable.row
for n in range(N):
    atomStructure.append(np.random.randint(0,N,np.random.randint(1,20)))
    arow['identity/element']='C'
    arow['identity/atomicNumber']=n
    arow['identity/atomicMass']=12.43+.1*n
    arow.append()
atomTable.flush()
atomStructure.flush()
tf.close()

#atomTable.flush()
#
#aa=tf.root.atoms.atoms.read()
#print(aa[1].identity.element)

In [15]:
# astropy Table

# topology/structure is left out as its shape is not pre-defined
atomDtypes=[
    ('identity/element','U2'),
    ('identity/atomicNumber','u8'),
    ('identity/atomicMass','d'),
    #('properties/electrical')
    ('properties/physical/partialCharge/neutral','d'),
    ('properties/physical/partialCharge/anion','d'),
    ('properties/physical/partialCharge/cation','d'),
    ('properties/physical/polarizability/neutral','d'),
    ('properties/physical/polarizability/anion','d'),
    ('properties/physical/polarizability/cation','d'),
    #('properties/chemical')
    ('topology/parent','i8'),
    ('topology/type','U'),
    ('topology/name','U'),
    ('topology/position',('d',(3,))),
    ('topology/velocity',('d',(3,))),
    #('topology/structure',('i4'))
    # ('topology/structure','object'), # dynamic list of ints
    #('implementation.')
    # ('part')
]
atom=np.zeros(5,dtype=atomDtypes)
import astropy.table
t=astropy.table.Table(atom)
t.write('astropy-test.hdf5',overwrite=True,path='atom')
t.write('astropy-test.fits',overwrite=True)

import pprint
pprint.pprint(atom)



array([('', 0, 0., 0., 0., 0., 0., 0., 0., 0, '', '', [0., 0., 0.], [0., 0., 0.]),
       ('', 0, 0., 0., 0., 0., 0., 0., 0., 0, '', '', [0., 0., 0.], [0., 0., 0.]),
       ('', 0, 0., 0., 0., 0., 0., 0., 0., 0, '', '', [0., 0., 0.], [0., 0., 0.]),
       ('', 0, 0., 0., 0., 0., 0., 0., 0., 0, '', '', [0., 0., 0.], [0., 0., 0.]),
       ('', 0, 0., 0., 0., 0., 0., 0., 0., 0, '', '', [0., 0., 0.], [0., 0., 0.])],
      dtype=[('identity/element', '<U2'), ('identity/atomicNumber', '<u8'), ('identity/atomicMass', '<f8'), ('properties/physical/partialCharge/neutral', '<f8'), ('properties/physical/partialCharge/anion', '<f8'), ('properties/physical/partialCharge/cation', '<f8'), ('properties/physical/polarizability/neutral', '<f8'), ('properties/physical/polarizability/anion', '<f8'), ('properties/physical/polarizability/cation', '<f8'), ('topology/parent', '<i8'), ('topology/type', '<U'), ('topology/name', '<U'), ('topology/position', '<f8', (3,)), ('topology/velocity', '<f8', (3,))])


In [None]:
# pandas: convert JSON to flat DType

import dataclasses
@dataclasses.dataclass
class FlatRecItem:
    name: str
    dtype: str
    shape: tuple=None
    unit: str=None
    def mkDtype(self):
        if self.shape is not None: return (self.name,(self.dtype,tuple(self.shape)))
        return (self.name,self.dtype)

def mkFlatRecItems(desc,prefix='',sep='.'):
    ret=[]
    # primitive type
    if 'dtype' in desc:
        p,d=prefix,desc['dtype']
        return [FlatRecItem(name=p,dtype=d,shape=tuple(desc['shape']) if 'shape' in desc else None,unit=desc.get('unit',None))]
    else:
        assert isinstance(desc,dict)
        for k,v in desc.items():
            ret+=mkFlatRecItems(desc=v,prefix=(prefix+sep if prefix else '')+k)
    return ret

atomDtype=[i.mkDtype() for i in mkFlatRecItems(atomJson)]
pprint(atomDtype)


In [13]:
# dictionary with flat values, used to fill the DataFrame
# throw exception with non-1D data

atomDict={
    'identity.element':'C',
    'identity.atomicNumber':6,
    'identity.atomicMass':12.0107,
    'properties.physical.partialCharge.neutral':8.9765e-4,
    'properties.physical.partialCharge.anion':-0.1256723,
    'properties.physical.partialCharge.cation':4.567e-3,
    'properties.physical.polarizability.neutral':1.2345,
    'properties.physical.polarizability.anion':2.7346,
    'properties.physical.polarizability.cation':1.7345,
    'topology.parent':5, # molecule ID
    'topology.type':'ca',
    'topology.name':'C25',
    'topology.position':np.array([1.339729,115.4507,0.0]),
    'topology.velocity':np.array([-3.04164,5.19528283,-28.147476]),
    # 'topology.structure': [2,7,12,123] # indices of bonded atoms
}
atomItems=tuple(atomDict.items())
atomValues=tuple(atomDict.values())
pprint(atomDict)
pprint(atomDtype)
pprint(atomValues)

    
import pandas as pd
atom0=np.recarray((1),dtype=atomDtype)
atom1=np.core.records.fromrecords([atomValues],dtype=atomDtype)
pt=tables.File('test2.h5','w')
t1=pt.create_table('/',name='atoms',description=atom1)
print(t1)
print(atom0)
import dask.dataframe
df=pd.DataFrame(atom1)
print(df)
atom2=np.array(list(atomDict.items()),dtype=atomDtype)
print(atom2)
print(pd.DataFrame.from_dict(data=atomDict,index=[0],orient='index'))
pprint(mkRecItems(atomJson))
pt.close()


{'identity.atomicMass': 12.0107,
 'identity.atomicNumber': 6,
 'identity.element': 'C',
 'properties.physical.partialCharge.anion': -0.1256723,
 'properties.physical.partialCharge.cation': 0.004567,
 'properties.physical.partialCharge.neutral': 0.00089765,
 'properties.physical.polarizability.anion': 2.7346,
 'properties.physical.polarizability.cation': 1.7345,
 'properties.physical.polarizability.neutral': 1.2345,
 'topology.name': 'C25',
 'topology.parent': 5,
 'topology.position': array([  1.339729, 115.4507  ,   0.      ]),
 'topology.type': 'ca',
 'topology.velocity': array([ -3.04164   ,   5.19528283, -28.147476  ])}
[('identity.element', 'a2'),
 ('identity.atomicNumber', 'l'),
 ('identity.atomicMass', 'd'),
 ('properties.physical.partialCharge.neutral', 'd'),
 ('properties.physical.partialCharge.anion', 'd'),
 ('properties.physical.partialCharge.cation', 'd'),
 ('properties.physical.polarizability.neutral', 'd'),
 ('properties.physical.polarizability.anion', 'd'),
 ('properties.

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


Exception: Data must be 1-dimensional