In [None]:
import json
from rdkit import Chem

from helper_functions_csd_search import (check_bonds, 
                                         check_neighbors, 
                                         check_unpaired_electrons, 
                                         check_isotope, 
                                         get_picks)

from rdkit import RDLogger   
# Switch off RDKit's own log output for all channels matching 'rdApp.*',
# so that SMILES parsing problems do not flood the notebook output.
RDLogger.DisableLog('rdApp.*')

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings of other libraries.
warnings.filterwarnings('ignore')

# 1) Filter data

In [None]:
# Read the extracted raw data (JSON) into memory.
with open("results_la_csd_search.json", "r") as json_in:
    results_la = json.load(json_in)

In [None]:
# Do the filtering
#
# Every entry of results_la receives three new fields:
#   "status"       - "pass" or the reason why the entry was rejected
#   "rdkit_smiles" - SMILES written by RDKit after adding hydrogens (None if parsing failed)
#   "atom_count"   - number of atoms including the added hydrogens (None if parsing failed)
# If several checks fail, "status" holds the message of the last failing check.

# Molecules with more atoms than this (added hydrogens included) are rejected.
MAX_ATOM_COUNT = 120

# SMILES strings seen so far; a set keeps the duplicate lookup constant-time.
all_rdkit_smiles = set()

counter_pass = 0
for central_atom, entries in results_la.items():
    # The key is split at "_": the first part is used as the element symbol of the
    # central atom, the second part as the required number of neighbors.
    key_parts = central_atom.split("_")
    central_atom_symbol = key_parts[0]
    neighbor_count = int(key_parts[1])

    for identifier, entry in entries.items():
        error = "pass"
        mol = Chem.MolFromSmiles(entry["processed_smiles"])

        if mol:
            mol = Chem.AddHs(mol)
            atom_count = len(mol.GetAtoms())
            rdkit_smiles = Chem.MolToSmiles(mol)

            # Check for duplications
            is_duplicate = rdkit_smiles in all_rdkit_smiles
            if is_duplicate:
                error = "This is a duplication."
            else:
                all_rdkit_smiles.add(rdkit_smiles)

            # Check for unpaired electrons.
            # A duplicate keeps its duplication status here: the picking step accepts
            # the "unpaired electrons at central atom" status, so overwriting the
            # duplication message would let duplicates slip into the picks.
            message = check_unpaired_electrons(mol, central_atom_symbol)
            if message != "pass" and not is_duplicate:
                error = message

            # Check for triple bonds (any bond order above 2)
            if any(bond.GetBondTypeAsDouble() > 2 for bond in mol.GetBonds()):
                error = "Bond order > 2 contained."

            # Check for hetero_atom-hetero_atom bonds
            if check_bonds(mol):
                error = "Hetero atom - hetero atom bond contained."

            # Check if desired amount of atoms is around central atom
            reject, found_neighbor_count = check_neighbors(mol, neighbor_count, central_atom_symbol)
            if reject:
                error = f"Central atom '{central_atom_symbol}' has wrong amount of neighbors (should: {neighbor_count}, is: {found_neighbor_count})"

            # Check for isotopes
            if check_isotope(mol):
                error = "Unwanted isotopes contained."

            # Check number of atoms
            if atom_count > MAX_ATOM_COUNT:
                error = "This molecule is too large."

        else:
            # RDKit could not parse the SMILES; nothing else can be checked.
            error = "Smiles cannt be converted to an rdkit mol object."
            rdkit_smiles = None
            atom_count = None

        if error == "pass":
            counter_pass += 1

        entry["status"] = error
        entry["rdkit_smiles"] = rdkit_smiles
        entry["atom_count"] = atom_count

# Display the number of entries that passed every check.
counter_pass

In [None]:
# Write the annotated results back to the JSON file.
with open("results_la_csd_search.json", "w") as json_out:
    json.dump(results_la, json_out, indent=4)

# 2) Pick data points

In [None]:
# Reload the results so this section can be run on its own.
with open("results_la_csd_search.json", "r") as json_in:
    results_la = json.load(json_in)

In [None]:
# Number of picks requested per central atom when more candidates are available.
N_SAMPLE_PER_CA = 500

for central_atom, entries in results_la.items():
    print(f"Working on {central_atom} ...")

    # Candidates are all entries that passed the filter.
    # Lone pairs at group 14 atoms (e.g. Si(II)) are defined as unpaired electrons
    # within RDKit; this should not lead to an exclusion of the molecule, so that
    # status is accepted as well.
    subset = {}
    for identifier, entry in entries.items():
        status = entry["status"]
        if status == "pass" or status == "Unpaired electron(s) at central atom contained.":
            subset[identifier] = entry

    # Only sample when there are more candidates than requested.
    if len(subset) <= N_SAMPLE_PER_CA:
        print("picking all ...")
        picks = list(subset.keys())
    else:
        print("picking a subset ...")
        picks = get_picks(subset, n_picks=N_SAMPLE_PER_CA)

    # Mark every entry of this central atom, picked or not.
    for identifier, entry in entries.items():
        entry["picked"] = identifier in picks and identifier in subset

    print(f"{len(picks)} compounds were picked.")
    print()

In [None]:
# Persist the "picked" flags in the JSON file.
with open("results_la_csd_search.json", "w") as json_out:
    json.dump(results_la, json_out, indent=4)

# 3) Embedding 

In [None]:
import os
import glob
import json
from ccdc import io
from ccdc import conformer
from ccdc.molecule import Molecule

In [None]:
# Reload the results so this section can be run on its own.
with open("results_la_csd_search.json", "r") as json_in:
    results_la = json.load(json_in)

In [None]:
# Generate one 3D conformer for every picked entry and write it to
# xyz_out/<central_atom>_CSD-<identifier>.mol. Entries whose file already exists
# are skipped, so the cell can be re-run after an interruption.

# Make sure the output folder exists; otherwise the first write fails on a fresh run.
os.makedirs("xyz_out", exist_ok=True)

# The generator does not depend on the molecule, so it is set up once.
conformer_generator = conformer.ConformerGenerator()
conformer_generator.settings.max_conformers = 1

for central_atom in results_la:
    print(f"Working on {central_atom} ...")
    for identifier in results_la[central_atom]:
        if not results_la[central_atom][identifier]["picked"]:
            continue

        name = f"{central_atom}_CSD-{identifier}"
        mol_path = os.path.join("xyz_out", f"{name}.mol")

        # Already embedded in an earlier run.
        if os.path.isfile(mol_path):
            continue

        mol = Molecule.from_string(results_la[central_atom][identifier]["rdkit_smiles"])
        conformers = conformer_generator.generate(mol)

        # An empty result is treated as a failed embedding instead of raising an
        # IndexError that would abort the remaining molecules.
        if conformers and conformers[0].molecule.all_atoms_have_sites:
            with io.EntryWriter(mol_path) as writer:
                writer.write(conformers[0].molecule)
        else:
            print(f"Embedding of {name} failed.")
    print()

In [None]:
# Collect the identifier part (text after the last "-") of every .mol file
# found in xyz_out; the file name is cut at its first ".".
all_successful = [
    os.path.basename(mol_path).split(".")[0].split("-")[-1]
    for mol_path in glob.glob(os.path.join("xyz_out", "*.mol"))
]

In [None]:
# Record for every picked entry whether its embedded structure exists on disk.
# The check uses the exact file name written by the embedding step
# (<central_atom>_CSD-<identifier>.mol), so an identifier that occurs under several
# central atoms is not marked as embedded because of another central atom's file.
# Entries that were not picked get no "la_embedded" field.
for central_atom, entries in results_la.items():
    for identifier, entry in entries.items():
        if entry["picked"] is True:
            mol_path = os.path.join("xyz_out", f"{central_atom}_CSD-{identifier}.mol")
            entry["la_embedded"] = os.path.isfile(mol_path)

In [None]:
# Persist the "la_embedded" flags in the JSON file.
with open("results_la_csd_search.json", "w") as json_out:
    json.dump(results_la, json_out, indent=4)