# Exploring conformational space of selected macrocycles - "m7"; <br /> Part 1: Generation and selection of conformer candidates (MM methods)

In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
import glob
import py3Dmol

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline 

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdMolAlign
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
print(rdBase.rdkitVersion)
import os,time
print( time.asctime())

2016.09.4
Fri Jun 16 14:13:44 2017


In [3]:
# Functions used in this notebook:

def grep_energies_from_sdf_outputs(files):
    energies = {}
    for inp in files:
        with open(inp,'r') as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                if "M  END" in line:
                    energies[os.path.splitext(os.path.basename(inp))[0]] = float(lines[i+1])
    return energies

def write_to_dict(prefix, suppl):
    moldict = {}
    for i, mol in enumerate(suppl):
        name = prefix + str(i)
        moldict[name] = mol
    return moldict

def align_structures_to_crystal(moldict):
    for key, mol in moldict.items():
        core_mol = mol.GetSubstructMatch(Chem.MolFromSmiles(core_smiles))
        AllChem.AlignMol(mol,m7_crystal,atomMap=list(zip(core_mol,core_m7)))

def align_structures_to_lowest_energy(moldict, energy_dict):
    """
    align structures from the structures' dictionary
    
    note that the dictionary in python is not ordered, so the "first" element is arbitrary
    """
    energy_sorted = sorted(energy_dict.items(), key=lambda x: x[1])
    first = energy_sorted[0][0]
    core_first = moldict[first].GetSubstructMatch(Chem.MolFromSmiles(core_smiles))
    
    for key, mol in moldict.items():
        core_mol = mol.GetSubstructMatch(Chem.MolFromSmiles(core_smiles))
        AllChem.AlignMol(mol,moldict[first],atomMap=list(zip(core_mol,core_first)))        

def prepare_view(moldict):
    p = py3Dmol.view(width=400,height=400)
    for key, mol in moldict.items():
        mb = Chem.MolToMolBlock(mol)
        p.addModel(mb,'sdf')
    p.setStyle({'stick':{'radius':'0.15'}})
    p.setBackgroundColor('0xeeeeee')
    p.zoomTo()
    return p        

def make_similarity_matrix(moldict):
    
    similarity_matrix = {}

    for k1, m1 in moldict.items():
        for k2, m2 in moldict.items():
            if (k1, k2) in similarity_matrix.keys() or (k2, k1) in similarity_matrix.keys():
                pass
            else:
                if k1 != k2:
                    #rms = AllChem.GetBestRMS(Chem.RemoveHs(m1),Chem.RemoveHs(m2))
                    #rms = rdMolAlign.AlignMol(Chem.RemoveHs(m1),Chem.RemoveHs(m2))
                    pyO3A = rdMolAlign.GetO3A(m1, m2)
                    rms = pyO3A.Align()
                    similarity_matrix[(k1, k2)] = rms
                    
    return similarity_matrix


def find_duplicates(rms_sorted, energy, rms_thresh):
    i = 0
    to_be_deleted = []
    while i < len(rms_sorted):
        j = i + 1
        while j < len(rms_sorted):
            if rms_sorted[i][0] in to_be_deleted:
                i = i + 1
                j = j + 1
            elif rms_sorted[j][0] in to_be_deleted:
                j = j + 1
            else:
                rms1 = rms_sorted[i][1]
                rms2 = rms_sorted[j][1]
                if (rms2 - rms1) < rms_thresh:
                    if energy[rms_sorted[i][0]] < energy[rms_sorted[j][0]]:
                        to_be_deleted.append(rms_sorted[j][0])
                    else:
                        to_be_deleted.append(rms_sorted[i][0])
                else:
                    break
        i = i + 1
    if to_be_deleted:
        print("Conformers which will be deleted:")    
        print(to_be_deleted)
    return to_be_deleted

## Crystal structure of "m7" macrocycle

In [4]:
cm7 = open('/home/gosia/work/work_on_gitlab/icho/calcs/m7/m7_crystal.xyz','r').read()
vcm7 = py3Dmol.view(width=400,height=400)
vcm7.removeAllModels()
vcm7.addModel(cm7,'xyz')
vcm7.setStyle({'stick':{'radius':0.15,'color':'spectrum'}})
vcm7.setBackgroundColor('0xeeeeee')
vcm7.zoomTo()
vcm7.show()

In [5]:
# "core" is a part of a molecule, which we wish to be the "most-aligned" among multiple conformers
smiles      = 'N1C(=O)c2nc(C(=O)NCCCNC(=O)c3nc(C(=O)NCCC1)ccc3)ccc2'
core_smiles = 'C(=O)c1nc(C=O)ccc1'

m7 = Chem.AddHs(Chem.MolFromSmiles(smiles))
core_m7 = m7.GetSubstructMatch(Chem.MolFromSmiles(core_smiles))

templ_m7 = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/m7_crystal.sdf')
m7_crystal = templ_m7[0]

## Conformers generated with the Balloon software:

Conformers were generated using the genetic algorithm as implemented in the Balloon software:

* starting with the crystal geometry kept as a template, results with prefix: "m7_b_sdf"; the crystal is of the "ss-ss" type;

* starting with the SMILES signature of m7 and allowing to "rebuild the geometry" (option --rebuildGeometry), results with prefix: "m7_b_smi"

* starting with structures generated in Avogadro (from the crystal geometry and pre-optimized) of the:
    * "ss_sa" type
    * "ss_aa" type
    * "sa_sa" type
    * "sa_as" type
    * "sa_aa" type
    * "aa_aa" type    

    where "ss\_sa" means "(syn-syn)\_(syn-anti)" configuration, etc. with the bracket notation used to mark conformations around the rings.
    

In all cases the Balloon software was asked to generate 100 conformers using the genertic algorithm with default settings (only "maxPostprocessIter" increased to 150 and "nGenerations" to 300).

In [6]:
inps_m7_b_sdf = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_crystalsdf/*.sdf')
inps_m7_b_smi = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_crystalsmiles/*.sdf')
inps_m7_b_ss_sa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_ss_sa/*.sdf')
inps_m7_b_ss_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_ss_aa/*.sdf')
inps_m7_b_sa_sa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_sa_sa/*.sdf')
inps_m7_b_sa_as = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_sa_as/*.sdf')
inps_m7_b_sa_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_sa_aa/*.sdf')
inps_m7_b_aa_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/results_starting_from_m7_aa_aa/*.sdf')

In [7]:
e_m7_b_sdf = grep_energies_from_sdf_outputs(inps_m7_b_sdf)
e_m7_b_smi = grep_energies_from_sdf_outputs(inps_m7_b_smi)
e_m7_b_ss_sa = grep_energies_from_sdf_outputs(inps_m7_b_ss_sa)
e_m7_b_ss_aa = grep_energies_from_sdf_outputs(inps_m7_b_ss_aa)
e_m7_b_sa_sa = grep_energies_from_sdf_outputs(inps_m7_b_sa_sa)
e_m7_b_sa_as = grep_energies_from_sdf_outputs(inps_m7_b_sa_as)
e_m7_b_sa_aa = grep_energies_from_sdf_outputs(inps_m7_b_sa_aa)
e_m7_b_aa_aa = grep_energies_from_sdf_outputs(inps_m7_b_aa_aa)

In [8]:
# write conformers to dictionaries
   
suppl_m7_b_sdf  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_crystal_sdfout.sdf')
suppl_m7_b_smi  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_crystal_smilesout.sdf')
suppl_m7_b_ss_sa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_ss_sa_sdfout.sdf')
suppl_m7_b_ss_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_ss_aa_sdfout.sdf')
suppl_m7_b_sa_sa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_sa_sa_sdfout.sdf')
suppl_m7_b_sa_as  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_sa_as_sdfout.sdf')
suppl_m7_b_sa_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_sa_aa_sdfout.sdf')
suppl_m7_b_aa_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/balloon/m7_aa_aa_sdfout.sdf')

allmol_m7_b_sdf   = write_to_dict("m7_b_sdf_", suppl_m7_b_sdf)
allmol_m7_b_smi   = write_to_dict("m7_b_smi_", suppl_m7_b_smi)
allmol_m7_b_ss_sa = write_to_dict("m7_b_ss_sa_", suppl_m7_b_ss_sa)
allmol_m7_b_ss_aa = write_to_dict("m7_b_ss_aa_", suppl_m7_b_ss_aa)
allmol_m7_b_sa_sa = write_to_dict("m7_b_sa_sa_", suppl_m7_b_sa_sa)
allmol_m7_b_sa_as = write_to_dict("m7_b_sa_as_", suppl_m7_b_sa_as)
allmol_m7_b_sa_aa = write_to_dict("m7_b_sa_aa_", suppl_m7_b_sa_aa)
allmol_m7_b_aa_aa = write_to_dict("m7_b_aa_aa_", suppl_m7_b_aa_aa)

The generated conformer structures are presented in a separate notebook: [link](http://nbviewer.jupyter.org/github/gosiao/icho-notebooks/blob/master/conformers_m7_suppl1.ipynb).

### pre-screening

All the generated structures are pre-optimized with MM methods (MMFF94-like force field). To remove potential duplicates, we will:

* calculate the root-mean-square-distance (RMSD) between the pairs of conformers (taking into account the heavy atoms only);
* compare the energies of conformers from pairs which are found to be similar (RMSD lower than the threshold);
* if these energies are too similar (the difference lower than the threshold), we will remove the conformer which has the higher energy value;

In [9]:
allmol_m7_b = {}
allmol_m7_b.update(allmol_m7_b_sdf)
allmol_m7_b.update(allmol_m7_b_smi)
allmol_m7_b.update(allmol_m7_b_ss_sa)
allmol_m7_b.update(allmol_m7_b_ss_aa)
allmol_m7_b.update(allmol_m7_b_sa_sa)
allmol_m7_b.update(allmol_m7_b_sa_as)
allmol_m7_b.update(allmol_m7_b_sa_aa)
allmol_m7_b.update(allmol_m7_b_aa_aa)

print("The total number of generated conformers = ", len(allmol_m7_b))
             
energy_m7_b = {}
energy_m7_b.update(e_m7_b_sdf)
energy_m7_b.update(e_m7_b_smi)
energy_m7_b.update(e_m7_b_ss_sa)
energy_m7_b.update(e_m7_b_ss_aa)
energy_m7_b.update(e_m7_b_sa_sa)
energy_m7_b.update(e_m7_b_sa_as)
energy_m7_b.update(e_m7_b_sa_aa)
energy_m7_b.update(e_m7_b_aa_aa)

The total number of generated conformers =  144


In [10]:
# 1. calculate the similarity matrix between all pairs of conformers and sort its elements from the lowest
# (the most similar structures) to the largest values (the most different structures)
similarity_matrix_b = make_similarity_matrix(allmol_m7_b)
similarity_matrix_b_sorted = sorted(similarity_matrix_b.items(), key=lambda x: x[1])

# 2. remove duplicates:
# for all pairs of structures, for which the similarity value is lower than threshold ("similarity_thresh"), 
# compare energies; then if the energies are similar (controlled by the "energy_thresh"), 
#then remove the one with higher energy

to_be_deleted = []
similarity_thresh = 1.0 # Angstrom
energy_thresh     = 5 # kcal/mol (Balloon original threshold used for conformer generation is 25 kcal/mol)

for pair in similarity_matrix_b_sorted:
    if pair[1] < similarity_thresh:
        conf1 = pair[0][0]
        conf2 = pair[0][1]
        if abs(energy_m7_b[conf1] - energy_m7_b[conf2]) < energy_thresh:
            #print("conf1, conf2, E(conf1), E(conf2) = ", conf1, conf2, energy_m7_b[conf1], energy_m7_b[conf2])
            if energy_m7_b[conf1] < energy_m7_b[conf2]:
                to_be_deleted.append(conf2)
            else:
                to_be_deleted.append(conf1)

for mol in to_be_deleted:
    print("to_be_deleted: ", mol)
    to_be_deleted_keys = list(k for k in similarity_matrix_b.keys() if mol in k)
    for k in to_be_deleted_keys:
        del similarity_matrix_b[k]
    allmol_m7_b.pop(mol, None)
    energy_m7_b.pop(mol, None)
    

to_be_deleted:  m7_b_sdf_11
to_be_deleted:  m7_b_ss_aa_5
to_be_deleted:  m7_b_smi_7
to_be_deleted:  m7_b_sdf_9
to_be_deleted:  m7_b_ss_aa_0
to_be_deleted:  m7_b_sdf_17
to_be_deleted:  m7_b_ss_sa_10
to_be_deleted:  m7_b_ss_sa_5
to_be_deleted:  m7_b_sdf_9
to_be_deleted:  m7_b_sdf_8
to_be_deleted:  m7_b_sa_aa_0
to_be_deleted:  m7_b_sa_aa_8
to_be_deleted:  m7_b_ss_aa_9
to_be_deleted:  m7_b_sdf_19
to_be_deleted:  m7_b_sdf_14
to_be_deleted:  m7_b_sdf_14
to_be_deleted:  m7_b_ss_aa_13
to_be_deleted:  m7_b_smi_7
to_be_deleted:  m7_b_smi_7
to_be_deleted:  m7_b_ss_sa_12
to_be_deleted:  m7_b_aa_aa_15
to_be_deleted:  m7_b_sa_as_4
to_be_deleted:  m7_b_sa_as_8
to_be_deleted:  m7_b_sa_as_7
to_be_deleted:  m7_b_sa_as_3
to_be_deleted:  m7_b_smi_7
to_be_deleted:  m7_b_ss_aa_7
to_be_deleted:  m7_b_sa_sa_8
to_be_deleted:  m7_b_sa_sa_1
to_be_deleted:  m7_b_sa_aa_5
to_be_deleted:  m7_b_sdf_19
to_be_deleted:  m7_b_ss_sa_0
to_be_deleted:  m7_b_sdf_19
to_be_deleted:  m7_b_sdf_17
to_be_deleted:  m7_b_smi_3
to_be

In [11]:
print("We have removed the potential conformer duplicates.")
print("The final number of remaining conformers = ", len(allmol_m7_b))
print("Below we present all the remaining conformers aligned (to one ring).")

We have removed the potential conformer duplicates.
The final number of remaining conformers =  1
Below we present all the remaining conformers aligned (to one ring).


In [12]:
align_structures_to_lowest_energy(allmol_m7_b, energy_m7_b)
p_b = prepare_view(allmol_m7_b)
p_b.show()

In [13]:
print("Sorted energy of all selected conformers and the energy differences with respect to the lowest:")
energy_sorted = sorted(energy_m7_b.items(), key=lambda x: x[1])
energy_diff = []
e_min = energy_sorted[0][1]
for e in energy_sorted:
    e_diff = e[1] - e_min
    energy_diff.append([e[0], e[1], e_diff])

e_df = pd.DataFrame(energy_diff, columns=["conformer", "E", "E - Emin"])
e_df

Sorted energy of all selected conformers and the energy differences with respect to the lowest:


Unnamed: 0,conformer,E,E - Emin
0,m7_b_aa_aa_0,38.647663,0.0


## Conformers generated with the RDKit software:

Conformers were generated using the distance geometry algorithm as implemented in the RDKit software:

* starting with the crystal geometry kept as a template, results with prefix: "m7_b_sdf"; the crystal is of the "ss-ss" type;

* starting with the SMILES signature of m7, results with prefix: "m7_b_smi"

* starting with structures generated in Avogadro (from the crystal geometry and pre-optimized) of the:
    * "ss_sa" type
    * "ss_aa" type
    * "sa_sa" type
    * "sa_as" type
    * "sa_aa" type
    * "aa_aa" type    

    where "ss\_sa" means "(syn-syn)\_(syn-anti)" configuration, etc. with the bracket notation used to mark conformations around the rings.


* the geometries of generated conformers were then pre-optimized with the MM methods (using UFF force field).

In all cases the RDKit software was asked to generate 100 conformers using the distnce geometry algorithm with default settings (only "pruneRmsThresh" set to 1.0 in "AllChem.EmbedMultipleConfs" and "maxIters" set to 500 in "AllChem.UFFOptimizeMolecule").

In [14]:
inps_m7_rdkit_smi = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_crystal_from_smiles/*.sdf')
inps_m7_rdkit_sdf = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_crystal_from_sdf/*.sdf')
inps_m7_rdkit_ss_sa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_ss_sa/*.sdf')
inps_m7_rdkit_ss_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_ss_aa/*.sdf')
inps_m7_rdkit_sa_sa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_sa_sa/*.sdf')
inps_m7_rdkit_sa_as = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_sa_as/*.sdf')
inps_m7_rdkit_sa_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_sa_aa/*.sdf')
inps_m7_rdkit_aa_aa = glob.glob('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/results_from_m7_aa_aa/*.sdf')

In [15]:
e_m7_rdkit_smi = grep_energies_from_sdf_outputs(inps_m7_rdkit_smi)
e_m7_rdkit_sdf = grep_energies_from_sdf_outputs(inps_m7_rdkit_sdf)
e_m7_rdkit_ss_sa = grep_energies_from_sdf_outputs(inps_m7_rdkit_ss_sa)
e_m7_rdkit_ss_aa = grep_energies_from_sdf_outputs(inps_m7_rdkit_ss_aa)
e_m7_rdkit_sa_sa = grep_energies_from_sdf_outputs(inps_m7_rdkit_sa_sa)
e_m7_rdkit_sa_as = grep_energies_from_sdf_outputs(inps_m7_rdkit_sa_as)
e_m7_rdkit_sa_aa = grep_energies_from_sdf_outputs(inps_m7_rdkit_sa_aa)
e_m7_rdkit_aa_aa = grep_energies_from_sdf_outputs(inps_m7_rdkit_aa_aa)

In [16]:
# write conformers to dictionaries
suppl_m7_rdkit_smi  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_smiles.sdf')
suppl_m7_rdkit_sdf  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_sdf.sdf')
suppl_m7_rdkit_ss_sa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_ss_sa.sdf')
suppl_m7_rdkit_ss_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_ss_aa.sdf')
suppl_m7_rdkit_sa_sa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_sa_sa.sdf')
suppl_m7_rdkit_sa_as  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_sa_as.sdf')
suppl_m7_rdkit_sa_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_sa_aa.sdf')
suppl_m7_rdkit_aa_aa  = Chem.SDMolSupplier('/home/gosia/work/work_on_gitlab/icho/calcs/m7/rdkit/result_m7_aa_aa.sdf')

allmol_m7_rdkit_smi = write_to_dict("m7_rdkit_smi_", suppl_m7_rdkit_smi)
allmol_m7_rdkit_sdf = write_to_dict("m7_rdkit_sdf_", suppl_m7_rdkit_sdf)
allmol_m7_rdkit_ss_sa = write_to_dict("m7_rdkit_ss_sa_", suppl_m7_rdkit_ss_sa)
allmol_m7_rdkit_ss_aa = write_to_dict("m7_rdkit_ss_aa_", suppl_m7_rdkit_ss_aa)
allmol_m7_rdkit_sa_sa = write_to_dict("m7_rdkit_sa_sa_", suppl_m7_rdkit_sa_sa)
allmol_m7_rdkit_sa_as = write_to_dict("m7_rdkit_sa_as_", suppl_m7_rdkit_sa_as)
allmol_m7_rdkit_sa_aa = write_to_dict("m7_rdkit_sa_aa_", suppl_m7_rdkit_sa_aa)
allmol_m7_rdkit_aa_aa = write_to_dict("m7_rdkit_aa_aa_", suppl_m7_rdkit_aa_aa)

### pre-screening

All the generated structures are pre-optimized with MM methods (UFF force field). To remove potential duplicates, we will:

* calculate the root-mean-square-distance (RMSD) between the pairs of conformers (taking into account the heavy atoms only);
* compare the energies of conformers from pairs which are found to be similar (RMSD lower than the threshold);
* if these energies are too similar (the difference lower than the threshold), we will remove the conformer which has the higher energy value;

In [None]:
allmol_m7_rdkit = {}
allmol_m7_rdkit.update(allmol_m7_rdkit_sdf)
allmol_m7_rdkit.update(allmol_m7_rdkit_smi)
allmol_m7_rdkit.update(allmol_m7_rdkit_ss_sa)
allmol_m7_rdkit.update(allmol_m7_rdkit_ss_aa)
allmol_m7_rdkit.update(allmol_m7_rdkit_sa_sa)
allmol_m7_rdkit.update(allmol_m7_rdkit_sa_as)
allmol_m7_rdkit.update(allmol_m7_rdkit_sa_aa)
allmol_m7_rdkit.update(allmol_m7_rdkit_aa_aa)

print("The total number of generated conformers = ", len(allmol_m7_rdkit))
             
energy_m7_rdkit = {}
energy_m7_rdkit.update(e_m7_rdkit_sdf)
energy_m7_rdkit.update(e_m7_rdkit_smi)
energy_m7_rdkit.update(e_m7_rdkit_ss_sa)
energy_m7_rdkit.update(e_m7_rdkit_ss_aa)
energy_m7_rdkit.update(e_m7_rdkit_sa_sa)
energy_m7_rdkit.update(e_m7_rdkit_sa_as)
energy_m7_rdkit.update(e_m7_rdkit_sa_aa)
energy_m7_rdkit.update(e_m7_rdkit_aa_aa)

The total number of generated conformers =  659


In [None]:
# 1. calculate the similarity matrix between all pairs of conformers and sort its elements from the lowest
# (the most similar structures) to the largest values (the most different structures)
similarity_matrix_rdkit = make_similarity_matrix(allmol_m7_rdkit)
similarity_matrix_rdkit_sorted = sorted(similarity_matrix_rdkit.items(), key=lambda x: x[1])

# 2. remove duplicates:
# for all pairs of structures, for which the similarity value is lower than threshold ("similarity_thresh"), 
# compare energies; then if the energies are similar (controlled by the "energy_thresh"), 
#then remove the one with higher energy

to_be_deleted = []
similarity_thresh = 1.0 # Angstrom 
energy_thresh     = 5 # kcal/mol 

for pair in similarity_matrix_rdkit_sorted:
    if pair[1] < similarity_thresh:
        conf1 = pair[0][0]
        conf2 = pair[0][1]
        if abs(energy_m7_rdkit[conf1] - energy_m7_rdkit[conf2]) < energy_thresh:
            #print("conf1, conf2, E(conf1), E(conf2) = ", conf1, conf2, energy_m7_b[conf1], energy_m7_b[conf2])
            if energy_m7_rdkit[conf1] < energy_m7_rdkit[conf2]:
                to_be_deleted.append(conf2)
            else:
                to_be_deleted.append(conf1)

for mol in to_be_deleted:
    print("to_be_deleted: ", mol)
    to_be_deleted_keys = list(k for k in similarity_matrix_rdkit.keys() if mol in k)
    for k in to_be_deleted_keys:
        del similarity_matrix_rdkit[k]
    allmol_m7_rdkit.pop(mol, None)
    energy_m7_rdkit.pop(mol, None) 

In [None]:
print("We have removed the potential conformer duplicates.")
print("The final number of remaining conformers = ", len(allmol_m7_rdkit))
print("Below we present all the remaining conformers aligned (to one ring).")

In [None]:
align_structures_to_lowest_energy(allmol_m7_rdkit, energy_m7_rdkit)
p_rdkit = prepare_view(allmol_m7_rdkit)
p_rdkit.show()

## Summary

Now let's generate a list of all conformers (from all programs used).
We will not further pre-screen the structures, as different programs use different force fields and the comparison of structures based on RMSD without the comparison of energies might lead to the removal of promising candidates.

In [None]:
allmol_m7 = {}
allmol_m7.update(allmol_m7_b)
allmol_m7.update(allmol_m7_rdkit)

energy_m7 = {}
energy_m7.update(energy_m7_b)
energy_m7.update(energy_m7_rdkit)

Finally we can align all conformers which will further be used as starting points in DFT geometry optimizations; the total number of conformers is:

In [None]:
print(len(allmol_m7))

In [None]:
align_structures_to_lowest_energy(allmol_m7, energy_m7)
p_all = prepare_view(allmol_m7)
p_all.show()

Write the selected conformers' names to the list "list_selected_conformers_from_balloon_rdkit". It will be used to generate Gaussian inputs:

In [None]:
with open("/home/gosia/work/work_on_gitlab/icho/calcs/m7/list_selected_conformers_from_ballon_rdkit", "w") as f:
    for key, mol in allmol_m7.items():
        f.write(key+"\n")

energy_sorted = sorted(energy_m7.items(), key=lambda x: x[1])
with open("/home/gosia/work/work_on_gitlab/icho/calcs/m7/detailed_list_selected_conformers_from_ballon_rdkit", "w") as f:
    for pair in energy_sorted:
        f.write("{0:30}   {1}\n".format(pair[0], pair[1]))        