In [1]:
import openbabel
import pybel
import sys
import os
import os.path
import csv
import pandas as pd
import argparse
import numpy as np
import itertools
import shutil
from itertools import combinations
import re
import collections
from collections import OrderedDict
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

import matplotlib
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter
import matplotlib.pyplot as plt

%matplotlib inline

  from tqdm.autonotebook import tqdm


In [None]:
# Root folder holding the energy tables; the `.out` files used further down
# are looked up in its `complete_dataset` sub-folder.
basedir = 'C:/Users/wellawat/Downloads/conf_energies'
datadir = f'{basedir}/complete_dataset'

# One table per (energy type, phase): C = conformational, S = strain.
df_gas_C, df_wat_C, df_gas_S, df_wat_S = (
    pd.read_csv(f'{basedir}/{csv_name}')
    for csv_name in (
        'Gas_coformational_energy.csv',
        'Water_coformational_energy.csv',
        'Gas_strain_energy.csv',
        'Water_strain_energy.csv',
    )
)

### 1) Find which molecules have high energies

In [None]:
# Parallel lists: position k in each list describes the same table.
energies = [df_gas_C, df_wat_C, df_gas_S, df_wat_S]
phases = ['gas', 'water'] * 2
out_type = ['conf'] * 2 + ['strain'] * 2
# Column holding the energy difference in each table.
types = ['Delta_conformational_energy(kJ/mol)'] * 2 + ['Delta_strain_energy'] * 2
# Axis labels matching each table (used by the plotting call).
xlabels = (
    [r'$\Delta$ Conformation energy (kJ/mol)'] * 2
    + [r'$\Delta$ Strain energy (kJ/mol)'] * 2
)

In [None]:
# Collect the file stems of every molecule that belongs to a pair whose energy
# difference lies above the cutoff of its energy type.
high_e_list = []
for df, phase, etype, heading in zip(energies, phases, out_type, types):
    # Cutoff: 30.0 for the strain tables, 35.0 for the conformational tables.
    thresh = 30.0 if etype == 'strain' else 35.0

    # Share of pairs that fall below the cutoff, in percent.
    percent = df[df[heading] < thresh].count().Refcodes/len(df)*100
    print(f'cutoff percentatge {etype}-{phase}:',percent)

    # Conformational pairs involve only the `xR` file of each molecule.
    # Strain pairs involve both the `xR` and the `R` file; the original branch
    # built these names but overwrote one with the other and never stored them,
    # so strain outliers were silently lost.
    suffixes = ('xR', 'R') if etype == 'strain' else ('xR',)

    ref_comb = df[df[heading] > thresh]['Refcodes'].values
    for comb in ref_comb:
        # `Refcodes` is "<ref1>_<ref2>" and `components` is "<m1>_<m2>".
        ref1, ref2 = comb.split('_')[:2]
        m1m2 = df[df['Refcodes']==comb]['components'].values[0]
        m1, m2 = m1m2.split('_')[:2]

        for ref, mol_id in ((ref1, m1), (ref2, m2)):
            for suffix in suffixes:
                high_e_list.append(f'{ref}_m_{mol_id}_{phase}_{suffix}')

    #df[df[types[i]]==22000]['Courses'].values[0]
    #print(phases[i],out_type[i] ,df[df[types[i]] < 30.0 ].count()/len(e)*100)
    #plot_histogram(e,f'{basedir}/{phases[i]}_{out_type[i]}_energy.png',xlabel=xlabels[i])

### 2) Copy the identified files to a new folder

In [None]:
# Copy every flagged `.out` file into `<datadir>/high_files`.
high_e_list = list(set(high_e_list))  # drop duplicate stems
high_dir = f'{datadir}/high_files'
# shutil.copy does not create missing directories, so make sure the target exists.
os.makedirs(high_dir, exist_ok=True)
for file in high_e_list:
    shutil.copy(f'{datadir}/{file}.out', f'{high_dir}/{file}.out')
    print(f'{file} copied to {high_dir}/{file}.out')

### 3) Flag each file by how it terminated
- NT - Normal termination
- ET - Error termination

In [None]:
# Append `_NT` or `_ET` to each `.out` file name in `high_files`, depending on
# whether the text "Normal termination" occurs in the file.
high_dir = f'{datadir}/high_files'
files = os.listdir(high_dir)
for file in files:
    if not file.endswith('.out'):
        continue
    # Skip files flagged by an earlier run; otherwise re-running this cell
    # would produce names such as `X_NT_NT.out`.
    if file.endswith(('_NT.out', '_ET.out')):
        continue

    with open(f'{high_dir}/{file}', "r") as f:
        readfile = f.read()

    if "Normal termination" in readfile:
        print('String Found In File',file)
        flag = 'NT'
    else:
        print('String Not Found In File',file)
        flag = 'ET'

    # Rewrite only the trailing extension, not every '.out' in the name.
    new_name = f"{file[:-len('.out')]}_{flag}.out"
    os.rename(f'{high_dir}/{file}', f'{high_dir}/{new_name}')

### 4) Write com files for the mol2 files created with babel
On the HPC, first convert each output file with `module load openbabel ; babel xxx.out xxx.mol2`

In [None]:
from ccdc.io import EntryReader
from ccdc.io import MoleculeReader
from ccdc.io import CrystalReader
from ccdc.io import CrystalWriter
from ccdc.io import MoleculeWriter
from ccdc import conformer

In [None]:
def create_mol(mol_file=None):
    """Load the first molecule of ``mol_file`` and pass it through a geometry analyser.

    The analyser is a ``conformer.GeometryAnalyser`` with its bond, angle and
    ring analysis switched off.

    Parameters
    ----------
    mol_file : str
        Path handed to ``MoleculeReader``.

    Returns
    -------
    The object returned by ``GeometryAnalyser.analyse_molecule``.
    """
    from ccdc.io import MoleculeReader

    first_molecule = MoleculeReader(mol_file)[0]

    analyser = conformer.GeometryAnalyser()
    for feature in ('bond', 'angle', 'ring'):
        getattr(analyser.settings, feature).analyse = False

    return analyser.analyse_molecule(first_molecule)

In [None]:
def write_input(mol_file, save_path=None,spin_mul='0 1'):
    """Write a ``.com`` input file for the molecule stored in ``mol_file``.

    The job keywords are chosen from the file name, which is expected to look
    like ``<refcode>_m_<n>_<phase>_<tag>...`` where ``<phase>`` is ``gas`` or
    ``water`` and ``<tag>`` is ``xR`` or ``R``.

    Parameters
    ----------
    mol_file : str
        Path to a ``.mol2`` file, using ``/`` as separator.
    save_path : str, optional
        Directory that receives the ``.com`` file (created if missing).
        When ``None`` the file is written next to ``mol_file``.
    spin_mul : str
        Line written directly after the title section; presumably the
        charge and spin multiplicity -- TODO confirm.

    Raises
    ------
    KeyError
        If the phase/tag combination found in the name is not a known job type.
    IndexError
        If the file name has fewer than five ``_``-separated fields.
    """
    job_types = {'gas-xR': 'opt',
    'gas-R':  'opt=modredundant',
    'water-xR': 'opt scrf(smd, solvent=water)',
    'water-R':'opt=modredundant scrf(smd, solvent=water)'}

    mol_name = mol_file.split('/')[-1]
    name_parts = mol_name.split('_')
    phase = name_parts[3]
    # Keep only the text before the first '.', so that `..._xR.mol2` resolves
    # to the same tag as `..._xR_NT.out.mol2`.
    ext = name_parts[4].split('.')[0]
    job = job_types[f'{phase}-{ext}']
    print(mol_file)

    mol = create_mol(mol_file)
    mol.assign_bond_types(which='all')
    mol.add_hydrogens(mode='missing')

    # One row per atom: element symbol followed by its coordinates.
    coords = []
    for a in mol.atoms:
        label = np.array([str(a.atomic_symbol)])
        xyz = np.array([c for c in a.coordinates])
        coords.append(np.concatenate((label, xyz)))

    if save_path is None:
        out_name = mol_file.replace('.mol2','.com')
    else:
        # Honour the requested output directory (previously ignored).
        os.makedirs(save_path, exist_ok=True)
        com_name = mol_name.replace('.mol2','.com')
        out_name = f'{save_path}/{com_name}'

    header = f'%NProcShared=24 \n%Chk={mol_name}.chk \n%Mem=25GB \n# M062X/6-31+G** int=(acc2e=14) {job} \n\n{mol_name}-{job} \n\n{spin_mul}'
    with open(out_name, 'wb') as f:
        np.savetxt(f, coords, delimiter=' ', newline='\n', header=header, footer='', comments='',fmt='%s')

    # Add one more newline so the file ends with an empty line.
    with open(out_name, 'a') as f2:
        f2.write('\n')

In [None]:
# Generate an input file for every mol2 file found in `mol_files`.
mol_dir = f'{datadir}/high_files/mol_files'
files = os.listdir(mol_dir)
for file in files:
    if not file.endswith('mol2'):
        continue
    write_input(f'{mol_dir}/{file}', save_path=f'{datadir}/high_files/com_files')

## After re-running the NT (normal-termination) files from their last frame: collect their energies and concatenate them with complete_dataset.txt

In [38]:
basedir = 'C:/Users/wellawat/Downloads/conf_energies'

# Two space-separated columns without a header row: file path and energy.
data = pd.read_csv(f'{basedir}/highE_NT_rerun_scf_done.txt', sep=' ', header=None, names=['filename', 'energy'])

# File stems (no directory, nothing after the first '.') listed in the error file.
with open(f'{basedir}/error_2nd_time.txt', 'r') as error_file:
    error_files = [line.strip().split('/')[-1].split('.')[0] for line in error_file]

# Reduce every path to its stem in one column-wide assignment. The previous
# element-wise `data['filename'][i] = ...` is chained assignment, which pandas
# does not guarantee to write through to `data`.
data['filename'] = data['filename'].map(lambda path: path.split('/')[-1].split('.')[0])

# Drop (and report) the rows whose file is listed in the error file.
failed = data['filename'].isin(error_files)
for file in data.loc[failed, 'filename']:
    print(file)
data = data[~failed]

data.to_csv(f'{basedir}/highE_NT_rerun_scf_done.csv',index=False)

FUYJAK01_m_1_water_xR_ET
HNIABZ11_m_1_water_xR_ET
IYEBUJ01_m_1_water_xR_ET
MILHOF01_m_1_water_xR_ET
OCHTET_m_1_water_xR_ET
SUWMIG02_m_1_water_xR_ET
UJIWEQ01_m_1_water_xR_ET


In [39]:
# Read the filtered table back from disk and show its (rows, columns) shape.
data = pd.read_csv(f'{basedir}/highE_NT_rerun_scf_done.csv')
print(data.shape)

(73, 2)


In [40]:
# Remove the `_NT` flag from every file name with one column-wide assignment.
# The previous element-wise `data['filename'][i] = ...` is chained assignment,
# which pandas does not guarantee to write through to `data`.
data['filename'] = data['filename'].str.replace('_NT', '', regex=False)
for fname in data['filename']:
    print(fname)

data.to_csv(f'{basedir}/highE_NT_rerun_scf_done.csv',index=False)
data.head()

APUDEV01_m_1_gas_xR
APUDEV01_m_1_water_xR
APUDEV_m_1_gas_xR
APUDEV_m_1_water_xR
BEMLOU03_m_1_gas_xR
BEMLOU23_m_1_gas_xR
ESIWUY02_m_1_gas_xR
ESIWUY_m_1_gas_xR
FODMAO01_m_1_gas_xR
FODMAO_m_1_gas_xR
FUYJAK01_m_1_gas_xR
FUYJAK_m_1_gas_xR
FUYJAK_m_1_water_xR
GEDSAM01_m_1_gas_xR
GEDSAM01_m_1_water_xR
GEDSAM_m_1_gas_xR
GEDSAM_m_1_water_xR
HAXMAW01_m_1_gas_xR
HAXMAW01_m_1_water_xR
HAXMAW_m_1_gas_xR
HAXMAW_m_1_water_xR
HNIABZ20_m_1_water_xR
ICIMAI01_m_1_gas_xR
ICIMAI_m_1_gas_xR
IHAPOX01_m_1_gas_xR
IHAPOX02_m_1_gas_xR
IHAPOX02_m_1_water_xR
IHAPOX_m_1_gas_xR
IHAPOX_m_1_water_xR
INOHIC01_m_1_gas_xR
INOHIC_m_1_gas_xR
IYEBUJ03_m_1_water_xR
KELGEO01_m_1_gas_xR
KELGEO01_m_1_water_xR
KELGEO02_m_1_gas_xR
KELGEO02_m_1_water_xR
KELGEO03_m_1_gas_xR
KELGEO03_m_1_water_xR
KELGEO_m_1_gas_xR
KELGEO_m_1_water_xR
KISQUZ01_m_1_gas_xR
KISQUZ01_m_1_water_xR
KISQUZ_m_1_gas_xR
KISQUZ_m_1_water_xR
MHQACD02_m_1_gas_xR
MHQACD02_m_1_water_xR
MHQACD_m_1_gas_xR
MHQACD_m_1_water_xR
MILHOF02_m_1_water_xR
MILHOF_m_1_water_xR


Unnamed: 0,filename,energy
0,APUDEV01_m_1_gas_xR,-3430.334139
1,APUDEV01_m_1_water_xR,-3430.336768
2,APUDEV_m_1_gas_xR,-3430.349545
3,APUDEV_m_1_water_xR,-3430.352048
4,BEMLOU03_m_1_gas_xR,-1762.472396


In [37]:
# NOTE(review): this cell's saved execution count (37) is lower than that of the
# cells before it (38-40), so the stored output was produced earlier and shows
# 'NT' in every `filename` entry. Re-run the notebook top to bottom to refresh it.
data

Unnamed: 0,filename,energy
0,NT,-3430.334139
1,NT,-3430.336768
2,NT,-3430.349545
3,NT,-3430.352048
4,NT,-1762.472396
...,...,...
68,NT,-3031.828342
69,NT,-3031.824967
70,NT,-3031.844427
71,NT,-3031.839424


### 5) Create images from mol2 files

In [None]:
import os
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Draw,AllChem
from rdkit.Chem.Draw import IPythonConsole

# Notebook rendering of RDKit molecules: 300 x 300 drawings with atom indices shown.
IPythonConsole.molSize = (300, 300)
IPythonConsole.drawOptions.addAtomIndices = True

In [None]:
# Gather the SMILES string of the first molecule in every mol2 file.
mol_dir = f'{datadir}/high_files/mol_files'
files = os.listdir(mol_dir)
smiles = []
for file in files:
    if not file.endswith('mol2'):
        continue
    mol = MoleculeReader(f'{mol_dir}/{file}')[0]
    smiles.append(mol.smiles)

In [None]:
# Spot-check: show the second collected SMILES string.
smiles[1]

In [None]:
# Render every molecule. Only the last top-level expression of a cell is
# echoed automatically, so a bare `mol` inside the loop shows nothing;
# `display` (provided by IPython in notebook sessions) renders each one.
for smile in smiles:
    mol = Chem.MolFromSmiles(smile)
    display(mol)


In [None]:
# Parse all SMILES strings and lay the molecules out in a single grid image.
ms = list(map(Chem.MolFromSmiles, smiles))
Draw.MolsToGridImage(ms)

In [None]:
# Load one mol2 file for inspection.
# NOTE(review): `Mol2MolSupplier` is neither defined nor imported in the visible
# notebook, so this cell raises NameError on a fresh kernel -- confirm where the
# helper is meant to come from.
filePath = 'C:/Users/wellawat/Downloads/conf_energies/complete_dataset/high_files/mol_files/HAXMAW01_m_1_gas_xR_NT.out.mol2'
database=Mol2MolSupplier(filePath,sanitize=True)

In [None]:
# Show what the loader call returned.
database

In [None]:
# NOTE(review): `m` is not assigned anywhere in the visible notebook; presumably
# it should be a molecule taken from `database` -- confirm before running.
Chem.MolToSmiles(m)