In [1]:
import ase.db #atomic simulations environment
from ase import io, Atom, Atoms

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EStateIndices 
from rdkit.Chem.EState import AtomTypes 

import pandas as pd

from subprocess import Popen, PIPE
import os

import numpy as np

import pickle

In [2]:
def shell(cmd, shell=False):
    """
    Run the command `cmd` and return whatever it wrote to standard output.

    Input:
    cmd; the command line as a single string
    shell; if True, `cmd` is passed unchanged to the system shell,
           otherwise it is split on whitespace and executed directly

    Returns:
    the captured stdout decoded as UTF-8 (stderr is captured but discarded)
    """
    # Only ``Popen`` and ``PIPE`` are imported by name in this notebook, so the
    # former ``subprocess.Popen(...)`` call raised NameError when shell=False.
    # Both modes now go through the imported ``Popen``.
    use_shell = bool(shell)
    args = cmd if use_shell else cmd.split()
    p = Popen(args, shell=use_shell, stdin=PIPE, stdout=PIPE, stderr=PIPE)

    # communicate() drains both pipes and waits for the process to finish
    output, err = p.communicate()

    return output.decode('utf-8')

In [3]:
def finger_print(mol,name,smiles,e_opt,homo,lumo,atom):
    """
    Create a dictionary with the e-state fingerprint for the molecule in mol (rdkit mol)

    Input:
    mol; rdkit mol object
    name; structure name
    smiles; SMILES string, stored under 'smiles'
    e_opt; energy gap (target), stored under 'E_opt'
    homo; stored under 'HOMO'
    lumo; stored under 'LUMO'
    atom; stored under 'Atom'

    Returns:
    dict holding the six entries above plus one entry per E-state atom type
    (keyed by the pattern name from AtomTypes.esPatterns) whose value is the
    corresponding element of the `sums` array returned by FingerprintMol.
    """
    # FingerprintMol returns (counts, sums); only the sums are stored. The
    # previous separate TypeAtoms / EStateIndices calls produced values that
    # were never used, so they have been dropped.
    _, sums = Fingerprinter.FingerprintMol(mol)

    # Make sure the pattern table exists before reading the type names from it
    if AtomTypes.esPatterns is None:
        AtomTypes.BuildPatts()

    # Distinct loop variable so the `name` parameter is not shadowed
    type_names = [patt_name for patt_name, _ in AtomTypes.esPatterns]

    data = {'name':name,'smiles':smiles,'E_opt':e_opt,'HOMO':homo,'LUMO':lumo,'Atom':atom}
    data.update(zip(type_names, sums))
    return data

In [4]:
# Build the E-state pattern table on first use, then keep only the
# atom-type labels (first element of every pattern pair).
if AtomTypes.esPatterns is None:
    AtomTypes.BuildPatts()

name_list = [label for label, _ in AtomTypes.esPatterns]

In [5]:
# Read the pickled table and drop every row that contains a missing value,
# then show summary statistics of the numeric columns.
dssc = pd.read_pickle("dssc_no_br.pickle").dropna(axis=0)
dssc.describe()

Unnamed: 0,SCF energy,E_opt,E_nm,Osc.,HOMO,LUMO,sub1,sub2,pos1,pos2
count,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0
mean,-71654.749805,2.265781,547.412962,0.480883,-7.273681,-1.591135,7.610478,17.036523,24.885639,25.301851
std,13315.161121,0.044793,10.640188,0.097478,0.196247,0.168438,5.551092,6.281933,10.300195,10.384375
min,-157008.185989,2.1644,490.85,0.1056,-7.871615,-2.108658,0.0,0.0,14.0,14.0
25%,-75783.974395,2.2349,541.5,0.4041,-7.416358,-1.708641,4.0,12.0,16.0,17.0
50%,-68941.559207,2.2577,549.17,0.4809,-7.275672,-1.591902,6.0,18.0,20.0,21.0
75%,-63379.510769,2.2896,554.77,0.547975,-7.123285,-1.471965,10.0,22.0,38.0,39.0
max,-54450.591233,2.5259,572.83,0.7936,-6.65796,-1.140455,26.0,26.0,41.0,41.0


In [6]:
# Preview the first rows of the cleaned table
dssc.head()

Unnamed: 0,name,Functional,Basis,SCF energy,E_opt,E_nm,Osc.,Atom,HOMO,LUMO,Calc.time,smiles,sub1,sub2,pos1,pos2,atom_type
3913,2-0-0-14-19,LC-wHPBE,6-31+G(d),-69773.542697,2.2466,551.87,0.4002,"[[9, 5.653210, 0.844010, 4.922620], [6, 5.1696...",-7.576909,-1.835722,"[0, 9, 34, 15.7]",FC(F)(F)C#C[c:14]1[c:13]2[C:1]3[N+:0]4=[C:11](...,0.0,0.0,14.0,19.0,"[9, 6, 9, 9, 6, 6, 6, 6, 6, 7, 6, 7, 6, 7, 6, ..."
2109,2-0-0-14-21,LC-wHPBE,6-31+G(d),-69773.641637,2.2595,548.73,0.4823,"[[9, 8.302740, 0.228869, 0.648290], [6, 7.6705...",-7.608475,-1.855586,"[0, 12, 39, 33.8]",FC(F)(F)C#C[c:14]1[c:13]2[C:1]3[N+:0]4=[C:11](...,0.0,0.0,14.0,21.0,"[9, 6, 9, 9, 6, 6, 6, 6, 6, 7, 6, 7, 6, 7, 6, ..."
751,2-0-0-14-41,LC-wHPBE,6-31+G(d),-69773.542846,2.2466,551.86,0.4002,"[[9, 6.596959, -3.425501, 1.095550], [6, 6.434...",-7.576909,-1.835722,"[0, 9, 46, 22.2]",FC(F)(F)C#C[c:14]1[c:13]2[C:1]3[N+:0]4=[C:11](...,0.0,0.0,14.0,41.0,"[9, 6, 9, 9, 6, 6, 6, 6, 6, 7, 6, 7, 6, 7, 6, ..."
4275,2-0-0-15-21,LC-wHPBE,6-31+G(d),-69773.645773,2.2315,555.6,0.5065,"[[9, -10.186699, -3.281673, 0.027570], [6, -9....",-7.686846,-1.945114,"[0, 9, 8, 30.2]",FC(F)(F)C#C[c:15]1[cH:14][c:13]2[C:1]3[N+:0]4=...,0.0,0.0,15.0,21.0,"[9, 6, 9, 9, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 7, ..."
5275,2-0-0-16-19,LC-wHPBE,6-31+G(d),-69773.572559,2.26,548.61,0.3631,"[[9, -9.026300, -3.553340, -1.113680], [6, -8....",-7.631061,-1.894772,"[0, 9, 23, 22.8]",FC(F)(F)C#C[c:16]1[cH:15][cH:14][c:13]2[C:1]3[...,0.0,0.0,16.0,19.0,"[9, 6, 9, 9, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, ..."


In [7]:
# Empty frame whose columns are the per-molecule metadata fields followed by
# one column per E-state atom type.
df = pd.DataFrame(
    columns=['name', 'smiles', 'E_opt', 'HOMO', 'LUMO', 'Atom', *name_list]
)

In [8]:
# Compute the E-state fingerprint for every row of `dssc`.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and made the loop quadratic.
# Building the frame from scratch also makes the cell safe to re-run
# (the old version appended the same rows again on every execution).
records = []
for row_index, row in dssc.iterrows():
    E_opt = row["E_opt"]
    name = row["name"]
    smiles = row["smiles"]
    homo = row["HOMO"]
    lumo = row["LUMO"]
    atom = row["Atom"]

    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles returns None when the SMILES cannot be parsed
        print(row_index, name)
        continue

    try:
        records.append(finger_print(m, name, smiles, E_opt, homo, lumo, atom))
    except AttributeError:
        # Report the offending row; the previous handler printed the undefined
        # names `i` and `formula`, which itself raised NameError.
        print(row_index, name)

# Reuse the column layout of the empty frame defined in the previous cell
df = pd.DataFrame(records, columns=df.columns)

In [9]:
# Summary statistics of the numeric columns of the fingerprint table
df.describe()

Unnamed: 0,E_opt,HOMO,LUMO,sLi,ssBe,ssssBe,ssBH,sssB,ssssB,sCH3,...,sBr,sSnH3,ssSnH2,sssSnH,ssssSn,sI,sPbH3,ssPbH2,sssPbH,ssssPb
count,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,...,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0,12102.0
mean,2.265781,-7.273681,-1.591135,0.0,0.0,0.0,0.0,0.0,-2.504668,7.948159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.044793,0.196247,0.168438,0.0,0.0,0.0,0.0,0.0,0.06183,2.063004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2.1644,-7.871615,-2.108658,0.0,0.0,0.0,0.0,0.0,-2.844903,6.27059,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.2349,-7.416358,-1.708641,0.0,0.0,0.0,0.0,0.0,-2.539449,6.580512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.2577,-7.275672,-1.591902,0.0,0.0,0.0,0.0,0.0,-2.497524,6.672038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.2896,-7.123285,-1.471965,0.0,0.0,0.0,0.0,0.0,-2.462048,8.082756,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.5259,-6.65796,-1.140455,0.0,0.0,0.0,0.0,0.0,-2.354953,19.941272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# List the column labels of the fingerprint table
df.keys()

Index(['name', 'smiles', 'E_opt', 'HOMO', 'LUMO', 'Atom', 'sLi', 'ssBe',
       'ssssBe', 'ssBH', 'sssB', 'ssssB', 'sCH3', 'dCH2', 'ssCH2', 'tCH',
       'dsCH', 'aaCH', 'sssCH', 'ddC', 'tsC', 'dssC', 'aasC', 'aaaC', 'ssssC',
       'sNH3', 'sNH2', 'ssNH2', 'dNH', 'ssNH', 'aaNH', 'tN', 'sssNH', 'dsN',
       'aaN', 'sssN', 'ddsN', 'aasN', 'ssssN', 'sOH', 'dO', 'ssO', 'aaO', 'sF',
       'sSiH3', 'ssSiH2', 'sssSiH', 'ssssSi', 'sPH2', 'ssPH', 'sssP', 'dsssP',
       'sssssP', 'sSH', 'dS', 'ssS', 'aaS', 'dssS', 'ddssS', 'sCl', 'sGeH3',
       'ssGeH2', 'sssGeH', 'ssssGe', 'sAsH2', 'ssAsH', 'sssAs', 'sssdAs',
       'sssssAs', 'sSeH', 'dSe', 'ssSe', 'aaSe', 'dssSe', 'ddssSe', 'sBr',
       'sSnH3', 'ssSnH2', 'sssSnH', 'ssssSn', 'sI', 'sPbH3', 'ssPbH2',
       'sssPbH', 'ssssPb'],
      dtype='object')

In [10]:
# Persist the fingerprint table as a pickle file in the working directory
df.to_pickle("estate_from_smile_no_br.pkl")