# Convert hMOF to JSON
[hMOF](https://mof.tech.northwestern.edu/databases) is one of the classic MOF databases. We used it as a training set for our generative models, a seed for new MOFs, and a basis for the supervised learning models employed in our pipeline.

In [1]:
from mofa.model import MOFRecord, LigandDescription, NodeDescription
from zipfile import ZipFile
from pathlib import Path
from ase.io import read
from io import StringIO
from tqdm import tqdm
import warnings
import json
import gzip

Configuration

In [2]:
# Stem of the output file name produced by this notebook
out_name = 'hmof'
# ZIP archive that holds the raw hMOF records
hmof_path = './raw-data/hMOF-10 1039 C2EE23201D-all-mofdb-version_dc8a0295db.zip'

Derived

In [3]:
# Gzip-compressed output file, stored under data/ and named after `out_name`
out_path = Path('data').joinpath(f'{out_name}.json.gz')

## Load in Example Data
Each record of the hMOF dataset is a JSON file held within the ZIP file. Let's pull the first one to start with.

In [4]:
with ZipFile(hmof_path) as zp:
    # Lazily walk the archive members and stop at the first JSON document
    json_members = (member for member in zp.infolist() if member.filename.endswith('.json'))
    example_info = next(json_members)

    # Read the raw bytes of that member and parse them as JSON
    raw_text = zp.read(example_info).decode()
    content = json.loads(raw_text)

## Create Functions for Converting Data
The MOF database has an excellent schema, making it easy to convert into the simplified one we'll use in our workflows.

First step is to decompose [MOFid](https://pubs.acs.org/doi/full/10.1021/acs.cgd.9b01050) -> ligand descriptions

In [5]:
def parse_mofid(mofid: str) -> tuple[list[LigandDescription], NodeDescription, str, int]:
    """Break a MOFid string into the pieces needed to describe a MOF

    The string is expected to look like
    ``<smiles>.<smiles>...<smiles> <prefix>.<topology>.cat<N>;<comment>``:
    a dot-separated list of SMILES strings, a single space, a
    three-field structure tag, and an optional comment after a ``;``.

    Args:
        mofid: MOFid from the record
    Returns:
        - List of the ligands
        - Description of the node
        - Topology name
        - Catenation number
    Raises:
        ValueError: If the string does not split into the expected number of
            fields, or the catenation field does not end in an integer
    """

    # Anything after the first ";" is a user comment and is discarded
    without_comment = mofid.partition(";")[0]

    # A single space separates the SMILES list from the structure tag
    smiles_part, structure_tag = without_comment.split(" ")

    # The structure tag has three fields; the first is not used here
    _, topology, cat_field = structure_tag.split(".")
    # Drop the first three characters (e.g. "cat" in "cat0") to get the number
    catenation = int(cat_field[3:])

    # Every SMILES but the last is a ligand;
    #  assumes the last entry is the node
    *ligand_smiles, node_smiles = smiles_part.split(".")
    ligands = [LigandDescription(smiles=smiles) for smiles in ligand_smiles]
    node = NodeDescription(node_smiles)
    return ligands, node, topology, catenation
# Try the parser on the MOFid of the example record loaded earlier
parse_mofid(content['mofid'])

([LigandDescription(smiles='[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-]', fragment_atoms=None),
  LigandDescription(smiles='[O-]C(=O)c1cc(F)c(cc1F)C(=O)[O-]', fragment_atoms=None),
  LigandDescription(smiles='[O-]C(=O)c1ccc(c(c1)F)C(=O)[O-]', fragment_atoms=None)],
 NodeDescription(smiles='[Zn][O]([Zn])([Zn])[Zn]'),
 'pcu',
 0)

That function will let us build the whole record

In [6]:
def assemble_record(hmof_record: dict) -> MOFRecord:
    """Convert the hMOF-format data into the one used by our workflow

    Args:
        hmof_record: Contents of the hMOF json document.
            The ``mofid``, ``cif`` and ``name`` entries are read.
    Returns:
        Data converted into the format used by our workflow
    Raises:
        KeyError: If ``hmof_record`` lacks one of the entries listed above
    """

    # Start by parsing the MOFid
    #  Read from the argument, not from a notebook-level global, so that the
    #  result always corresponds to the record that was passed in
    ligands, node, topology, cat = parse_mofid(hmof_record['mofid'])

    # Convert the CIF-format structure into ASE's 'vasp' text format
    with warnings.catch_warnings():
        # Silence warnings raised while parsing the CIF and writing it back out
        warnings.simplefilter('ignore')
        atoms = read(StringIO(hmof_record['cif']), format='cif')
        fp = StringIO()
        atoms.write(fp, 'vasp')
        strc = fp.getvalue()

    # Assemble everything into a record
    record = MOFRecord(
        identifiers=dict(name=hmof_record['name']),
        topology=topology,
        catenation=cat,
        nodes=[node],
        ligands=ligands,
        structure=strc,
    )

    # TODO (wardlt): Save the isotherm information
    return record
# Try the converter on the example record loaded earlier
assemble_record(content)

MOFRecord(identifiers={'name': 'hMOF-6'}, topology='pcu', catenation=0, nodes=[NodeDescription(smiles='[Zn][O]([Zn])([Zn])[Zn]')], ligands=[LigandDescription(smiles='[O-]C(=O)c1cc(F)c(c(c1F)F)C(=O)[O-]', fragment_atoms=None), LigandDescription(smiles='[O-]C(=O)c1cc(F)c(cc1F)C(=O)[O-]', fragment_atoms=None), LigandDescription(smiles='[O-]C(=O)c1ccc(c(c1)F)C(=O)[O-]', fragment_atoms=None)])

## Run for Whole Dataset
Pretty easy, just apply that function over all JSONs in the ZIP file.

In [7]:
# Counters for the summary printed at the end
succeeded = 0
failed = 0
with ZipFile(hmof_path) as zp, gzip.open(out_path, 'wt') as fo:
    for info in tqdm(zp.infolist()):
        # Get only the JSON files
        if not info.filename.endswith('.json'):
            continue

        # Records without a MOFid cannot be converted, skip them
        content = json.loads(zp.read(info).decode())
        if content.get('mofid') is None:
            continue

        # Conversion is best-effort: a record that cannot be parsed is counted
        #  and skipped. Only `Exception` is caught so that Ctrl-C still stops
        #  the run, and the `try` covers only the conversion so that errors
        #  while writing the output are not hidden.
        try:
            record = assemble_record(content)
            line = record.to_json()
        except Exception:
            failed += 1
            continue

        # Print into new format, one JSON document per line
        print(line, file=fo)
        succeeded += 1

print(f'Succeeded in parsing {succeeded} MOFs')
print(f'Failed to convert {failed} MOFs which had a MOFid')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 275304/275304 [35:42<00:00, 128.51it/s]

Succeeded in parsing 114436 MOFs



