In [1]:
import sys
sys.path.insert(0, '/home/misa/git_repositories/xyz2mol_modified/')

import xyz2mol

from rdkit import Chem
import glob

def get_xyz_list(filepath):
    """
    Split a concatenated xyz file into one list of lines per amon.

    A new record starts at every line that consists solely of digits (the
    atom-count header of an xyz record). Whitespace around the number is
    ignored when detecting a header, so padded counts such as "  12" are
    recognised as well.

    NOTE: any other line made up only of digits (e.g. a purely numeric
    comment line) is also treated as the start of a record.

    Parameters
    ----------
    filepath : str
        Path of the xyz file to read.

    Returns
    -------
    list of list of str
        One entry per record. Each entry holds the lines of that record
        (trailing newline removed, other whitespace kept) from its header
        line up to, but not including, the next header line. Lines that
        precede the first header are dropped; a file without any header
        yields an empty list.
    """
    lines = []
    header_rows = []
    with open(filepath, 'r') as f:
        for row, raw_line in enumerate(f):
            text = raw_line.strip('\n')
            # Detect the header on a whitespace-stripped copy only; the stored
            # line keeps its original padding.
            if text.strip().isdigit():
                header_rows.append(row)
            lines.append(text)

    # Each record ends where the next one starts; the last one runs to EOF.
    record_ends = header_rows[1:] + [len(lines)]
    return [lines[start:end] for start, end in zip(header_rows, record_ends)]
        

def generate_smiles(xyz_data, use_huckel=False):
    """
    Build a canonical SMILES string from the xyz data of a single molecule.

    Parameters
    ----------
    xyz_data : list of str
        xyz lines of one molecule, in the form accepted by
        ``xyz2mol.read_xyz_data`` (presumably one item of the list returned
        by ``get_xyz_list`` - confirm against the modified xyz2mol).
    use_huckel : bool, optional
        Forwarded to ``xyz2mol.xyz2mol``. Huckel uses extended Huckel bond
        orders to locate bonds (requires RDKit 2019.9.1 or later); otherwise
        van der Waals radii are used. Defaults to False, the value this
        function has always passed.

    Returns
    -------
    str
        Isomeric SMILES obtained after a SMILES -> Mol -> SMILES round trip.

    Raises
    ------
    AssertionError
        If xyz2mol does not return exactly one molecule.
    ValueError
        If RDKit cannot parse the intermediate SMILES.
    """
    # read atoms and coordinates. Try to find the charge
    atoms, charge, xyz_coordinates = xyz2mol.read_xyz_data(xyz_data)

    # Get the molobjs
    mols = xyz2mol.xyz2mol(atoms, xyz_coordinates,
        charge=charge,
        use_graph=True,
        allow_charged_fragments=True,
        embed_chiral=False,
        use_huckel=use_huckel)

    # Raise explicitly so the message ends up in the exception (the previous
    # `assert ..., print(...)` attached None) and the check survives `-O`.
    if len(mols) != 1:
        raise AssertionError('Mols not 1 (got {})'.format(len(mols)))

    # Canonical hack: write the SMILES, parse it again and write it once more
    isomeric_smiles = True
    first_pass = Chem.MolToSmiles(mols[0], isomericSmiles=isomeric_smiles)
    reparsed = Chem.MolFromSmiles(first_pass)
    if reparsed is None:
        # MolFromSmiles signals a parse/sanitisation failure by returning None
        raise ValueError('RDKit could not parse SMILES {!r}'.format(first_pass))

    return Chem.MolToSmiles(reparsed, isomericSmiles=isomeric_smiles)

In [2]:
# All amon xyz files of the qm9_11k set, in sorted (lexicographic) order.
amons_paths = sorted(
    glob.glob('/home/misa/git_repositories/aqml-data/qm9_11k/*_amons.xyz')
)
print(amons_paths)

# SMILES string -> xyz lines of the amon. When two amons give the same SMILES
# the one processed later replaces the earlier entry.
smiles_dict = {}

for filepath in amons_paths:
    xyz_list = get_xyz_list(filepath)
    for xyz in xyz_list:
        smiles = generate_smiles(xyz)
        smiles_dict[smiles] = xyz

['/home/misa/git_repositories/aqml-data/qm9_11k/01_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/02_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/03_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/04_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/05_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/06_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/07_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/08_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/09_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/10_amons.xyz', '/home/misa/git_repositories/aqml-data/qm9_11k/11_amons.xyz']


In [3]:
# Number of distinct SMILES keys held in smiles_dict.
len(smiles_dict)

14354

In [6]:
# Put the APDFT helper directory at the front of the module search path so
# that utils_qm can be imported from it.
apdft_utils_dir = '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/'
sys.path.insert(0, apdft_utils_dir)
import utils_qm as uqm

# Hand the SMILES -> xyz mapping to utils_qm.save_obj together with the target
# path (presumably this serialises the dict to disk - confirm in utils_qm).
unique_amons_path = '/home/misa/datasets/amons_qm9_11k/unique_amons_dict'
uqm.save_obj(smiles_dict, unique_amons_path)
