In [106]:
import os,sys,json,re
sys.path.insert(0, '../../scripts/python')
from prettyjson import prettyjson
import ase
from ase.io import read
from ase.visualize import view
import numpy as np
import ubjson

In [4]:
# List the contents of the repo-fit-bulk/ directory (shell escape; run relative to the notebook's working directory).
!ls repo-fit-bulk/

README                                 mebox-minimal-oos-T188-pbe-b1b.xyz
dispts_quip_params.xml                 mebox-minimal-oos-T188-pbe-mbdint.xyz
fit_mbd_soap.sh                        mebox-minimal-oos-T188-pbe0-b1b.xyz
fit_soap_box.sh                        mebox-minimal-oos-T188-pbe0-mbdint.xyz
mebox-minimal-oos-T110-pbe-b1b.xyz     mebox-minimal-pbe-b1b.xyz
mebox-minimal-oos-T110-pbe-mbdint.xyz  mebox-minimal-pbe-mbdint.xyz
mebox-minimal-oos-T110-pbe0-b1b.xyz    mebox-minimal-pbe0-b1b.xyz
mebox-minimal-oos-T110-pbe0-mbdint.xyz mebox-minimal-pbe0-pbeotf-mbdint.xyz


In [49]:
# Read the whole input file into a single string.
# NOTE(review): `fn` is a hard-coded absolute path on one machine; the notebook
# cannot be re-run elsewhere without editing it.
fn = '/Users/iMac/Downloads/input.data'
with open(fn, 'r') as f:
    # The 'unicode-escape' round trip turns every real line break into the two
    # literal characters backslash + 'n', so the text becomes one long line and
    # the regular expressions applied later can match across original lines.
    data = f.read().encode('unicode-escape').decode()

In [111]:
# Regular expressions applied to the text held in `data`. That text has been
# passed through `.encode('unicode-escape')`, so each original line break is
# the literal two-character sequence backslash + 'n' -- which is why the
# patterns below look for `\\n` rather than a real newline.
patterns = {
    # everything between a `begin` and the next `end` keyword (one record)
    # (key spelling kept as-is so any other code indexing it keeps working)
    'struture':r'(?<=begin).*?(?=end)',
    # rest of a line following the `lattice` keyword
    'lattice':r'(?<=lattice).*?(?=\\n)',
    # rest of a line following the `atom` keyword
    'atom':r'(?<=atom).*?(?=\\n)',
    # plain decimal number (no exponent notation)
    'float':r"[-+]?[0-9]*\.?[0-9]+",
    # number following the `energy` keyword (captured in group 1)
    'energy':r"energy\s+([-+]?[0-9]*\.?[0-9]+)\s+\\n",
    # chemical element symbols
    'species':r"[BCFHIKNOPSUVWY]|A[cglmrstu]|B[aehikr]|C[adeflmnorsu]|D[bsy]|E[rsu]|F[elmr]|G[ade]|H[efgos]|I[nr]|Kr|L[airuv]|M[dgnot]|N[abdeiop]|Os|P[abdmortu]|R[abefghnu]|S[bcegimnr]|T[abcehilm]|Uu[opst]|Xe|Yb|Z[nr]",
}

# Split the text into one chunk per begin...end record. Without this line
# `results` is undefined on a fresh kernel and the loop below raises NameError.
results = re.findall(patterns['struture'], data)

frames = []
for res in results:
    # One row of numbers per `lattice` line; the rows together form the cell.
    lat = []
    for lattice_line in re.findall(patterns['lattice'], res):
        lat.append(list(map(float, re.findall(patterns['float'], lattice_line))))

    # Each `atom` line is split on whitespace: the first three fields are used
    # as the position, the fourth as the element symbol and the last three as
    # the force on that atom.
    pos, sym, force = [], [], []
    for atom_line in re.findall(patterns['atom'], res):
        fields = atom_line.split()
        pos.append(list(map(float, fields[:3])))
        force.append(list(map(float, fields[-3:])))
        sym.append(fields[3])

    # First `energy` entry of the record.
    energy = float(re.findall(patterns['energy'], res)[0])

    frame = ase.Atoms(positions=pos, symbols=sym, cell=lat, pbc=True)
    frame.info['energy'] = energy
    frame.set_array('forces', np.array(force))
    # Fold the atoms back into the periodic cell.
    frame.wrap(eps=1e-11)
    frames.append(frame)

In [154]:
# Display the `info` dict of `frame` -- presumably the loop variable left over
# from the parsing cell, i.e. the last frame that was built.
frame.info

{'energy': -1272.64893}

In [167]:
def _to_builtin(value):
    """Recursively replace numpy arrays/scalars inside ``value`` by plain Python objects."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, dict):
        return {key: _to_builtin(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_builtin(item) for item in value]
    return value


def tofile(fn,frames):
    """Serialize a sequence of frames to a ``.json`` or ``.ubjson`` file.

    The output is one dictionary with:
      * ``'ids'``    -- list of string ids ``'1'`` ... ``'N'``,
      * one entry per id holding ``positions``, ``cell``, ``numbers``, ``pbc``,
        ``info`` and ``arrays`` (every per-atom array of the frame other than
        the four already stored at the top level),
      * ``'nextid'`` -- the string ``str(N + 1)``.

    Parameters
    ----------
    fn : str
        Output path. The extension selects the encoder: ``.json`` is written
        as text through ``prettyjson``, ``.ubjson`` as binary through
        ``ubjson.dump``.
    frames : sequence
        Objects exposing ``get_positions()``, ``get_cell()``,
        ``get_atomic_numbers()``, ``get_pbc()``, ``info`` and ``arrays``
        (e.g. ``ase.Atoms``).

    Raises
    ------
    ValueError
        If the extension of ``fn`` is neither ``.json`` nor ``.ubjson``.
        The check runs before any work is done, so an unsupported name no
        longer results in silently writing nothing.
    """
    _, extension = os.path.splitext(fn)
    if extension not in ('.json', '.ubjson'):
        raise ValueError(
            "unsupported file extension {!r} for {!r}; expected '.json' or '.ubjson'".format(extension, fn))

    # Already stored at the top level of each entry; skipped when copying `arrays`.
    keys = ['positions','cell','numbers','pbc']
    data = {}
    data['ids'] = list(map('{}'.format,range(1, len(frames)+1)))
    for ii,frame in zip(data['ids'], frames):
        aa = dict(positions=frame.get_positions().tolist(),cell=frame.get_cell().tolist(),
                numbers=frame.get_atomic_numbers().tolist(),pbc=frame.get_pbc().tolist())
        # `info` may hold numpy arrays or numpy scalars; convert them to plain
        # Python values so encoders that only know builtin types can handle them.
        aa['info'] = _to_builtin(frame.info)
        aa['arrays'] = {}
        for k,v in frame.arrays.items():
            if k not in keys:
                aa['arrays'][k] = v.tolist()
        data[ii] = aa
    data['nextid'] = '{}'.format(len(frames)+1)

    if extension == '.json':
        with open(fn, 'w') as f:
            data_pretty = prettyjson(data,indent=2, maxlinelength=80)
            f.write(data_pretty)
    else:
        # Imported lazily so the JSON branch works without py-ubjson installed.
        import ubjson
        with open(fn, 'wb') as f:
            # NOTE(review): no_float32=False lets the encoder store floats as
            # 32-bit values, which presumably trades precision for file size --
            # confirm that this is acceptable for positions/energies/forces.
            ubjson.dump(data, f,no_float32=False)
def fromfile(fn):
    """Load a list of frames from a ``.json`` or ``.ubjson`` file.

    Parameters
    ----------
    fn : str
        Input path. ``.json`` files are handed to ``ase.io.read(fn, ':')``;
        ``.ubjson`` files are decoded with ``ubjson.load`` and every entry
        listed under ``'ids'`` is rebuilt as an ``ase.Atoms`` object from its
        ``positions``, ``cell``, ``numbers`` and ``pbc`` fields. Optional
        ``info`` and ``arrays`` fields are restored as well.

    Returns
    -------
    list
        The frames, in the order given by the file.

    Raises
    ------
    ValueError
        If the extension of ``fn`` is neither ``.json`` nor ``.ubjson``
        (previously this surfaced as an UnboundLocalError at the ``return``).
    """
    _, extension = os.path.splitext(fn)
    if extension not in ('.json', '.ubjson'):
        raise ValueError(
            "unsupported file extension {!r} for {!r}; expected '.json' or '.ubjson'".format(extension, fn))

    if extension == '.json':
        return read(fn, ':')

    # Imported lazily so the JSON branch works without py-ubjson installed.
    import ubjson
    with open(fn, 'rb') as f:
        data = ubjson.load(f)

    frames = []
    for idx in data['ids']:
        ff = data['{}'.format(idx)]
        frame = ase.Atoms(positions=ff['positions'], cell=ff['cell'],
                          numbers=ff['numbers'], pbc=ff['pbc'])
        if 'info' in ff:
            frame.info = ff['info']
        if 'arrays' in ff:
            # Stored as nested lists; turn them back into numpy arrays.
            for k,v in ff['arrays'].items():
                frame.set_array(k, np.array(v))
        frames.append(frame)
    return frames

In [159]:
# Write the current `frames` list to disk; the '.ubjson' extension selects the
# binary UBJSON branch of `tofile`.
fn_out = '../methane_sulfonic.ubjson'
tofile(fn_out,frames)

In [168]:
# Read the file back; this rebinds `frames` to the freshly loaded list.
frames = fromfile(fn_out)

In [169]:
# Number of frames currently held in `frames`.
len(frames)

4141

In [170]:
# Pass the frames to `ase.visualize.view` for interactive inspection.
view(frames)

In [5]:
# Map an output name to the .xyz file it is read from
# (paths are relative to the notebook's working directory).
fns = {'molecular_crystals':'CSD1000-r.xyz',
       'silicon_bulk':'gp_iter6_sparse9k.xml.xyz',
       'qm9':'qm9_5000.xyz',
       'methane_liquide':'repo-fit-bulk/mebox-minimal-pbe0-b1b.xyz'}
for k,fn in fns.items():
    # ':' asks ase.io.read for every frame in the file, not just the last one.
    frames = read(fn,':')
    print(k,len(frames))
    # Each dataset is written one directory up as '<name>.json'.
    fn_out = '../'+k+'.json'
    tofile(fn_out,frames)

molecular_crystals 1000
silicon_bulk 2475
qm9 5000
methane_liquide 258
