In [6]:
from rdkit import Chem
from rdkit.Chem import AllChem
from ase.calculators.mopac import MOPAC
from ase.io import read
import pandas as pd
import numpy as np
import time
from ase import Atoms
import json
import pickle

In [2]:
import mordred
from mordred import Calculator, descriptors
from rdkit.Chem import rdMolDescriptors

In [108]:
# Load the periodic-table JSON. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
# NOTE(review): this cell reads "portable.json" while the atomic-energy cell
# reads "perotable.json" — confirm which file name is intended.
with open("portable.json") as periodic_table_file:
    a = json.load(periodic_table_file)

# Load the energy dataset; columns are selected by position, so the CSV layout
# must keep the SMILES in column 1 and the molecular energy in column 2.
dataset = pd.read_csv('../../Data/Energy/EnergyDataset-H.csv')
mol_energy = dataset.columns[2]
energy_list = np.array(dataset[mol_energy])

smiles = dataset.columns[1]
smiles_list = np.array(dataset[smiles])


In [4]:
# load periodic table json file (context manager so the handle is closed)
with open("perotable.json") as periodic_table_file:
    a = json.load(periodic_table_file)

# create dictionary of elements...key: atomic number, value: atomic symbol
table = a['elements']
elemnt_table = {}
# only the first 55 entries of the table are used (heavier elements are assumed
# not to occur in the dataset); widen the slice if more elements are needed
for i in table[:55]:
    elemnt_table[i['number']] = i['symbol']

# create a dictionary of energies per atom...key: atomic number, value: energy via PM7/MOPAC
energy_dict = {}
for i, symbol in elemnt_table.items():
    # build an ASE Atoms object holding a single atom placed at the origin
    mol = Atoms(symbol, positions=[(0, 0, 0)])

    # attach a MOPAC calculator; each element gets its own label so the
    # calculator's files for different elements do not share a name
    mol.calc = MOPAC(label=f'atomic_energy{symbol}', task = 'UHF')

    # run the calculation and record the isolated-atom potential energy
    pse = mol.get_potential_energy()

    energy_dict[i] = pse
print(energy_dict)

{1: -11.07011, 2: -54.09664, 3: -4.80412, 4: -25.26494, 5: -50.84441, 6: -115.15732, 7: -177.85963, 8: -289.97952, 9: -462.69609, 10: -272.22972, 11: -5.81548, 12: -18.66074, 13: -62.20211, 14: -96.94941, 15: -170.60826, 16: -173.44087, 17: -265.24998, 18: -264.22578, 19: -4.88807, 20: -19.09281, 21: -37.25729, 22: -73.75413, 23: -119.28588, 24: -260.39534, 25: -226.56671, 26: -390.66498, 27: -398.01083, 28: -375.58231, 29: -673.16583, 30: -26.97859, 31: -57.28036, 32: -84.20003, 33: -123.88732, 34: -156.69076, 35: -221.41628, 36: -259.00171, 37: -4.12096, 38: -14.89116, 39: -34.67746, 40: -52.67139, 41: -127.57961, 42: -187.81216, 43: -204.71501, 44: -214.02948, 45: -396.89494, 46: -520.81164, 47: -541.67938, 48: -27.35116, 49: -53.1579, 50: -86.40474, 51: -118.62499, 52: -255.21535, 53: -222.90791, 54: -364.51658, 55: -3.99631}


In [7]:
# Persist the per-element energies so the MOPAC runs need not be repeated.
with open('energy_dict.pickle', 'wb') as pickle_out:
    pickle.dump(
        energy_dict,
        pickle_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [8]:
# Read the pickled dictionary back from disk into `b`.
# Only unpickle files produced by this notebook: pickle can execute code on load.
with open('energy_dict.pickle', 'rb') as pickle_in:
    b = pickle.load(pickle_in)

In [67]:
def f(x):
    """Return the atomic number of ``x`` by calling its ``GetAtomicNum()`` method."""
    return x.GetAtomicNum()

In [58]:
def atoms_to_energy(rdkit_atoms,energy_dict):
    """
    Sum the per-atom energies of every atom in a molecule.

    Parameters
    ----------
    rdkit_atoms : object exposing ``GetAtoms()``
        Each returned atom must expose ``GetAtomicNum()``.
    energy_dict : mapping
        Maps atomic number -> energy of the isolated atom.

    Returns
    -------
    The sum of ``energy_dict[atomic_number]`` over all atoms; ``0`` when the
    molecule has no atoms.

    Raises
    ------
    KeyError
        If an atom's atomic number is not a key of ``energy_dict`` (for a
        plain ``dict``).
    """
    # The atomic number is read directly from each atom so this function does
    # not depend on a helper defined in another notebook cell.
    return sum(energy_dict[atom.GetAtomicNum()] for atom in rdkit_atoms.GetAtoms())

In [59]:
def get_diss_energy(smiles,energy_dict):
    """
    Return the summed isolated-atom energy of the molecule described by ``smiles``.

    The SMILES string is parsed with RDKit, explicit hydrogens are added, and
    the per-atom energies from ``energy_dict`` are summed over every atom via
    ``atoms_to_energy``. Only this single sum is returned.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    energy_dict : mapping
        Maps atomic number -> energy of the isolated atom.

    Returns
    -------
    The sum of the atomic energies of all atoms, hydrogens included.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    KeyError
        Propagated from ``atoms_to_energy`` when the molecule contains an
        element that is missing from ``energy_dict``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; fail with a
    # clear message here instead of an obscure error from AddHs.
    if mol is None:
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    # Hydrogens are implicit in SMILES; make them explicit so they are counted.
    mol = Chem.AddHs(mol)
    return atoms_to_energy(mol, energy_dict)

In [109]:
# Number of SMILES entries loaded from the dataset.
len(smiles_list)

4609

In [110]:
# Accumulators filled by generate_all_changes():
#   change_list -> one energy difference per successfully processed SMILES
#   failures    -> SMILES strings whose processing raised an exception
change_list, failures = [], []

In [111]:
def generate_all_changes():
    """
    Fill ``change_list`` and ``failures`` from the notebook-level dataset arrays.

    For every SMILES in ``smiles_list`` the summed atomic energy is computed
    with ``get_diss_energy`` and subtracted from the matching entry of
    ``energy_list``; the difference is appended to ``change_list``. A SMILES
    whose processing raises is appended to ``failures`` instead.

    Both result lists are emptied first, so re-running the cell reproduces the
    same result rather than appending a second copy of every value.
    """
    change_list.clear()
    failures.clear()
    for smiles, total_energy in zip(smiles_list, energy_list):
        try:
            ind_energy = get_diss_energy(smiles, energy_dict)
            change_list.append(total_energy - ind_energy)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop a long run.
            failures.append(smiles)

In [112]:
# Run the computation; results accumulate in change_list and failures.
generate_all_changes()

In [113]:
# Number of SMILES for which the energy computation raised an exception.
len(failures)

4

In [114]:
# Positions of every dataset entry whose SMILES is recorded in `failures`.
id_to_remove = [
    position
    for position, candidate in enumerate(smiles_list)
    if candidate in failures
]

# Drop those positions so the remaining SMILES line up with change_list.
new_smiles_list = np.delete(smiles_list, id_to_remove)

In [115]:
# Both lengths should match before the two sequences are combined into a frame.
print(len(change_list), len(new_smiles_list), sep='\n')

4605
4605


In [116]:
# Pair each surviving SMILES with its energy difference, one row per molecule.
descriptor_data = {
    'Smiles': new_smiles_list,
    'Change-Energy': change_list,
}
df = pd.DataFrame(descriptor_data)

# Write the table to disk (the DataFrame index is written as the first column).
df.to_csv('../../Data/Energy/ChangeDataset-B.csv')
print('shape: ', df.shape)

shape:  (4605, 2)
