## Convert the ASE atoms of QM9 to RDKit molecules and generate InChIs

In [None]:
import os
import json
import ase
import ase.db.sqlite

In [None]:
from utils.conversions import xyz2rdkit

In [None]:
import tempfile
import rdkit

In [None]:
# Same as utils.conversions.xyz2rdkit (this definition shadows the one imported above), but lets the use_huckel parameter be varied
import xyz2mol
def xyz2rdkit(src_file: str, use_huckel=False, use_graph=True):
    """Build a molecule from the XYZ file at *src_file* using xyz2mol.

    Args:
        src_file: Path of the XYZ file; it is read with
            ``xyz2mol.read_xyz_file``.
        use_huckel: Forwarded unchanged to ``xyz2mol.xyz2mol``.
        use_graph: Forwarded unchanged to ``xyz2mol.xyz2mol``.
            NOTE(review): the default of True is assumed to match both
            xyz2mol's own default and utils.conversions.xyz2rdkit -- confirm.

    Returns:
        The single molecule returned by ``xyz2mol.xyz2mol``.

    Raises:
        AssertionError: If xyz2mol does not return exactly one molecule.
    """
    atoms, charge, xyz_coordinates = xyz2mol.read_xyz_file(src_file)
    mols = xyz2mol.xyz2mol(
        atoms,
        xyz_coordinates,
        charge=charge,
        use_graph=use_graph,
        allow_charged_fragments=True,
        embed_chiral=True,
        use_huckel=use_huckel,
    )
    assert len(mols) == 1, "expected exactly one molecule from xyz2mol"
    return mols[0]

In [None]:
# Convert every row of the QM9 ASE database to an InChI string.
#   inchis  - maps the 0-based idx to the InChI obtained for it
#   failed  - every idx for which no InChI could be produced
#   success - number of successful conversions
db = ase.db.connect('data/qm9.db')
inchis = {}
success = 0
failed = []
n_rows = len(db)  # evaluated once; used for the range and the progress output
for idx in range(n_rows):
    # The database is queried with id = idx + 1.
    ase_idx = idx + 1
    for row in db.select(id=int(ase_idx)):
        atoms = row.toatoms()

        # xyz2rdkit reads its input from a file, so the atoms are written to a
        # temporary XYZ file first. The context manager removes the directory
        # again, including when the conversion below raises.
        with tempfile.TemporaryDirectory() as tmpdir:
            xyzfile = os.path.join(tmpdir, 'test.xyz')
            ase.io.write(xyzfile, atoms)
            # `except Exception` (rather than a bare `except`) keeps
            # KeyboardInterrupt/SystemExit able to stop this long-running loop.
            try:
                try:
                    rmol = xyz2rdkit(xyzfile, use_huckel=False)
                    # This fails for some 18 molecules.
                except Exception:
                    print(idx, ': try with use_huckel=True...', end='')
                    rmol = xyz2rdkit(xyzfile, use_huckel=True)
                    # This fails only for 6 molecules out of the above 18 molecules,
                    print('success!')
                inchi = rdkit.Chem.rdinchi.MolToInchi(rmol)[0]
                inchis[idx] = inchi
                success += 1
            except Exception:
                failed.append(idx)
                print("Failed on id {}.".format(idx))
        if idx % 100 == 0:
            print("progress: {:.1f}%".format(100*idx/n_rows), end="\r")

# Shown as the cell output: number of successes and number of failures.
success, len(failed)

In [None]:
# Write the idx -> InChI dictionary to a JSON file.
inchi_file = 'data/qm9-inchis.json'
with open(inchi_file, mode='w', encoding='utf-8') as out_fh:
    out_fh.write(json.dumps(inchis, indent=0))

In [None]:
# Write the list of idx values for which no InChI could be obtained.
failed_file = 'data/qm9-inchi-failed.json'
with open(failed_file, mode='w', encoding='utf-8') as out_fh:
    out_fh.write(json.dumps(failed, indent=0))

## View the failed molecules

In [None]:
# Fetch the atoms object of every molecule listed in `failed`.
failedats = []
for idx in failed:
    print(idx)
    # The database is queried with id = idx + 1.
    matches = db.select(id=int(idx + 1))
    first_row = next(matches)
    failedats.append(first_row.toatoms())

In [None]:
# Display one of the failed structures with ASE's 'x3d' viewer.
from ase.visualize import view
# The index is hard-coded; this raises IndexError when failedats holds fewer
# than four entries. `amol` is reused by the cell that follows.
amol = failedats[3]
view(amol, viewer='x3d')

In [None]:
# Re-run the conversion for `amol` without any exception handling, so that an
# error raised by xyz2rdkit or by the InChI generation is shown in full.
# The context manager removes the temporary directory even when xyz2rdkit raises.
with tempfile.TemporaryDirectory() as tmpdir:
    xyzfile = os.path.join(tmpdir, 'test.xyz')
    ase.io.write(xyzfile, amol)
    rmol = xyz2rdkit(xyzfile, use_huckel=False)
inchi = rdkit.Chem.rdinchi.MolToInchi(rmol)[0]