In [None]:
import re
import json
from ccdc.search import SubstructureSearch, SMARTSSubstructure  # 3.0.14

In [None]:
def search_csd_la(central_atom, valency):
    """
    Search the CSD for molecules with a given central atom (e.g. "B") and valency (e.g., 3).
    This is done with a SubstructureSearch based on a SMARTS string of the form
    f"[{central_atom}X{valency}v{valency}]-[#6,#7,#8,#16]".

    Every hit is passed through ``filter_search`` and the surviving SMILES strings are
    de-duplicated; only the first hit producing a given SMILES is kept.

    Args:
        central_atom: Element symbol of the central atom, e.g. "B".
        valency: Number inserted after both the ``X`` and the ``v`` primitive of the
            SMARTS pattern.

    Returns:
        A dict mapping the identifier of each retained hit to
        ``{"processed_smiles": <SMILES of the hit's heaviest component>}``.
        The number of retained hits is also printed.
    """

    # Built once per search instead of once per hit.
    allowed_ligand_atoms = frozenset(["H", "C", "N", "O", "S", "F", "Cl", "Br", "I"])
    # One "<element symbol><optional count>" token of a formula such as "C10 H12 B1".
    element_count_pattern = re.compile(r"([A-Z][a-z]?)\s*(\d*)")

    def filter_search(molecule, central_atom):
        """
        Filter the hits found in the CSD.

        Returns the SMILES string of the heaviest component of ``molecule``, or None if
        the hit is rejected (polymeric, charged, central atom not present exactly once,
        disallowed ligand atoms, unparsable formula, or SMILES generation failed).
        """
        # Remove polymers
        if molecule.is_polymeric:
            return None

        # Remove molecules with formal charges
        if molecule.has_charged_atoms:
            return None

        # Pick heaviest component to get rid of solvent molecules. If the LA is not heavier than
        # the solvent molecule it is lost. This is ignored so far.
        heaviest_component = molecule.heaviest_component
        formula = heaviest_component.formula

        # Whatever remains after removing all element/count tokens (apart from whitespace)
        # is not part of a plain sum formula; reject such hits instead of guessing.
        if element_count_pattern.sub("", formula).strip():
            return None

        # An element without an explicit count is taken to occur once (previously a
        # trailing element without a count raised an IndexError).
        atom_counts = {}
        for symbol, count in element_count_pattern.findall(formula):
            atom_counts[symbol] = int(count) if count else 1

        # The desired central atom has to be contained exactly once
        if atom_counts.get(central_atom) != 1:
            return None

        # Check for unwanted atoms in the ligand(s)
        for atom in atom_counts:
            if atom != central_atom and atom not in allowed_ligand_atoms:
                return None

        # Try to get smiles. This stays best-effort (a failure just drops the hit), but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            return heaviest_component.to_string(format="smiles")
        except Exception:
            return None

    substructure = SMARTSSubstructure(
        f"[{central_atom}X{valency}v{valency}]-[#6,#7,#8,#16]"
    )

    substructure_search = SubstructureSearch()
    _ = substructure_search.add_substructure(substructure)

    all_hits = {}
    # Set membership keeps the duplicate check O(1) per hit.
    seen_smiles = set()
    # Molecules are processed one at a time rather than collected in a list first.
    for hit in substructure_search.search(max_hits_per_structure=1):
        molecule = hit.molecule
        processed_smiles = filter_search(molecule, central_atom)
        if processed_smiles and processed_smiles not in seen_smiles:
            seen_smiles.add(processed_smiles)
            all_hits[molecule.identifier] = {"processed_smiles": processed_smiles}

    print(f"Hits: {len(all_hits)}")
    return all_hits

In [None]:
# (central atom, valency) combinations that are searched for, in processing order.
elements = [
    ("B", 3),
    ("Al", 3),
    ("Ga", 3),
    ("In", 3),
    ("Si", 2),
    ("Ge", 2),
    ("Sn", 2),
    ("Pb", 2),
    ("Si", 4),
    ("Ge", 4),
    ("Sn", 4),
    ("Pb", 4),
    ("P", 3),
    ("As", 3),
    ("Sb", 3),
    ("Bi", 3),
    ("P", 5),
    ("As", 5),
    ("Sb", 5),
    ("Bi", 5),
    ("Te", 4),
]

# Search results, keyed by "<central atom>_<valency>".
RESULTS = {}

for symbol, valency in elements:
    label = f"{symbol}_{valency}"
    print(f"Working on {label} ...")
    RESULTS[label] = search_csd_la(symbol, valency)
    print("Done")
    print()

In [None]:
'''
Output of the above cell.

Working on B_3 ...
Hits: 2801
Done

Working on Al_3 ...
Hits: 55
Done

Working on Ga_3 ...
Hits: 68
Done

Working on In_3 ...
Hits: 25
Done

Working on Si_2 ...
Hits: 14
Done

Working on Ge_2 ...
Hits: 85
Done

Working on Sn_2 ...
Hits: 57
Done

Working on Pb_2 ...
Hits: 35
Done

Working on Si_4 ...
Hits: 6919
Done

Working on Ge_4 ...
Hits: 563
Done

Working on Sn_4 ...
Hits: 1370
Done

Working on Pb_4 ...
Hits: 155
Done

Working on P_3 ...
Hits: 2120
Done

Working on As_3 ...
Hits: 296
Done

Working on Sb_3 ...
Hits: 232
Done

Working on Bi_3 ...
Hits: 187
Done

Working on P_5 ...
Hits: 537
Done

Working on As_5 ...
Hits: 63
Done

Working on Sb_5 ...
Hits: 680
Done

Working on Bi_5 ...
Hits: 226
Done

Working on Te_4 ...
Hits: 481
Done

'''

In [None]:
# Store the collected search results as indented JSON.
with open("results_la_csd_search.json", "w") as json_file:
    json_file.write(json.dumps(RESULTS, indent=4))