In [None]:
# Dataset curation notebook

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from ase.calculators.mopac import MOPAC
from ase.io import read
import pandas as pd
import numpy as np
import time

In [2]:
# Load the source dataset; the path is relative to the notebook's working directory
dataset = pd.read_csv('../../Data/Solubility/dataset-B.csv')

In [3]:
# Column labels are looked up by position: index 4 -> SMILES, index 0 -> ID
smiles, ID = dataset.columns[4], dataset.columns[0]

# Pull both columns out as numpy arrays
smiles_list, id_list = (np.array(dataset[label]) for label in (smiles, ID))

print (smiles_list)
print (id_list)

# Accumulators that generate() appends to
pse_list = []
failures = []

['CCC(CC)O[N+]([O-])=O' 'Oc1cc(Br)cc(C=O)c1'
 '[H+].[Cl-].CCOC(=O)C1(CCN(C)CC1)c2ccccc2' ...
 'CCc1cccc(CC)c1N(COC)C(=O)CCl' 'CCCCOCN(C(=O)CCl)c1c(CC)cccc1CC'
 'CCOCN(C(=O)CCl)c1c(C)cccc1CC']
['B-1' 'B-2' 'B-3' ... 'B-4648' 'B-4649' 'B-4650']


In [4]:
def generate():
    """Compute a MOPAC potential energy for every SMILES in ``smiles_list``.

    For each molecule the SMILES is parsed with RDKit, hydrogens are added,
    a 3D conformer is embedded and written to ``xyz/<id>.xyz``; the file is
    then read back with ASE and the energy is evaluated through the MOPAC
    calculator.

    Results are stored in the module-level lists: the energy of each
    successful molecule is appended to ``pse_list`` (in input order) and the
    SMILES of each failed molecule is appended to ``failures``. Both lists
    are emptied first so that re-running the cell does not duplicate entries.

    Returns:
        None. Progress, failures and the elapsed time are printed.
    """
    import os  # local import: only needed to make sure the output folder exists

    # Start from a clean slate so pse_list stays aligned with a single pass
    # over smiles_list even when this cell is executed more than once.
    failures.clear()
    pse_list.clear()

    # Make sure the target folder for the .xyz files exists; a missing folder
    # would otherwise surface as a failure for each molecule.
    os.makedirs('xyz', exist_ok=True)

    start = time.time()
    print ('starting...')
    for i, smiles in enumerate(smiles_list):
        try:
            #create mol object
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                raise ValueError('RDKit could not parse the SMILES')
            mol = Chem.AddHs(mol)

            #embed molecule; a negative conformer id signals that embedding failed
            if AllChem.EmbedMolecule(mol) < 0:
                raise ValueError('RDKit could not embed a 3D conformer')

            #create xyz file
            fileName = 'xyz/' + id_list[i] + '.xyz'
            Chem.MolToXYZFile(mol, fileName)

            #use ASE to read xyz file
            mol = read(fileName)

            #calculate
            mol.calc = MOPAC(label='TMP', task = 'UHF BONDS GRADS')

            #get potential energy
            pse = mol.get_potential_energy()

            #save into list
            pse_list.append(pse)

            print (id_list[i], ' done')
        except Exception as err:
            # Only ordinary errors are treated as a per-molecule failure;
            # KeyboardInterrupt/SystemExit still stop the run.
            print ('error in ', id_list[i], '-', repr(err))
            failures.append(smiles)

    print ('DONE. time taken = ', time.time()- start)

In [5]:
# Run the pipeline over every molecule; fills pse_list and failures as a side effect
generate()

starting...
B-1  done
B-2  done
error in  B-3




B-4  done
B-5  done
B-6  done
B-7  done
B-8  done
B-9  done
B-10  done
B-11  done
B-12  done
B-13  done
B-14  done
B-15  done
B-16  done
B-17  done
B-18  done
B-19  done
B-20  done
B-21  done
B-22  done
B-23  done
B-24  done
B-25  done
B-26  done
B-27  done
B-28  done
B-29  done
B-30  done
B-31  done
B-32  done
B-33  done
B-34  done
B-35  done
B-36  done
B-37  done
B-38  done
B-39  done
B-40  done
B-41  done
B-42  done
B-43  done
B-44  done
B-45  done
B-46  done
B-47  done
B-48  done
B-49  done
B-50  done
B-51  done
B-52  done
B-53  done
B-54  done
B-55  done
B-56  done
B-57  done
B-58  done
B-59  done
B-60  done
B-61  done
B-62  done
B-63  done
B-64  done
B-65  done
B-66  done
B-67  done
B-68  done
B-69  done
B-70  done
B-71  done
B-72  done
B-73  done
B-74  done
B-75  done
error in  B-76




B-77  done
B-78  done
B-79  done
B-80  done
B-81  done
B-82  done
B-83  done
B-84  done
B-85  done
B-86  done
B-87  done
B-88  done
B-89  done
B-90  done
B-91  done
B-92  done
B-93  done
B-94  done
B-95  done
B-96  done
B-97  done
B-98  done
B-99  done
B-100  done
B-101  done
B-102  done
B-103  done
B-104  done
B-105  done


RDKit ERROR: [20:24:26] UFFTYPER: Unrecognized charge state for atom: 4


B-106  done
B-107  done
B-108  done
B-109  done
B-110  done
B-111  done
B-112  done
B-113  done
B-114  done
error in  B-115
B-116  done
B-117  done
B-118  done
error in  B-119
B-120  done
B-121  done
error in  B-122




B-123  done
B-124  done
B-125  done
B-126  done
B-127  done
B-128  done
B-129  done
B-130  done
B-131  done
B-132  done
B-133  done
B-134  done
B-135  done
B-136  done
B-137  done
B-138  done
B-139  done
B-140  done
B-141  done
B-142  done
B-143  done
B-144  done
B-145  done
B-146  done
B-147  done
B-148  done
B-149  done
B-150  done
B-151  done
B-152  done
B-153  done
B-154  done
B-155  done
B-156  done
B-157  done
B-158  done
B-159  done
B-160  done
B-161  done
B-162  done
B-163  done
B-164  done
B-165  done
B-166  done
B-167  done
B-168  done
B-169  done
B-170  done
B-171  done
B-172  done
B-173  done
B-174  done
B-175  done
B-176  done
B-177  done
B-178  done
B-179  done
B-180  done
B-181  done
B-182  done
B-183  done


RDKit ERROR: [20:38:39] UFFTYPER: Unrecognized charge state for atom: 1


B-184  done
B-185  done
B-186  done
B-187  done
B-188  done
B-189  done
B-190  done
B-191  done
B-192  done
B-193  done
B-194  done
B-195  done
B-196  done
B-197  done
B-198  done
B-199  done
B-200  done
B-201  done
B-202  done
B-203  done
B-204  done
B-205  done
B-206  done
B-207  done
B-208  done
B-209  done
B-210  done
B-211  done
B-212  done
B-213  done
B-214  done
B-215  done
B-216  done
B-217  done
B-218  done
B-219  done
B-220  done
B-221  done
B-222  done
B-223  done
B-224  done
B-225  done
B-226  done
B-227  done
B-228  done
B-229  done
B-230  done
B-231  done
B-232  done
B-233  done
B-234  done
B-235  done
B-236  done
B-237  done
B-238  done
B-239  done
B-240  done
B-241  done
B-242  done
B-243  done
B-244  done
B-245  done
B-246  done
B-247  done
B-248  done
B-249  done
B-250  done
B-251  done
B-252  done
B-253  done
B-254  done
B-255  done
B-256  done
B-257  done
B-258  done
B-259  done
B-260  done
B-261  done
B-262  done
B-263  done
B-264  done
B-265  done
B-266  done
B-26

RDKit ERROR: [21:00:44] UFFTYPER: Unrecognized charge state for atom: 1


B-742  done
error in  B-743


RDKit ERROR: [21:00:44] UFFTYPER: Unrecognized charge state for atom: 2


B-744  done
B-745  done
B-746  done
B-747  done
B-748  done
B-749  done
B-750  done
B-751  done
B-752  done
B-753  done
B-754  done
B-755  done
B-756  done
B-757  done
B-758  done
B-759  done
B-760  done
B-761  done


RDKit ERROR: [21:04:35] UFFTYPER: Unrecognized atom type: Ag5 (0)


B-762  done
B-763  done
B-764  done
B-765  done
B-766  done
B-767  done
B-768  done
B-769  done
B-770  done
B-771  done
B-772  done
B-773  done
B-774  done
B-775  done
B-776  done
B-777  done
B-778  done
B-779  done
B-780  done
B-781  done
B-782  done
B-783  done
B-784  done
B-785  done
B-786  done
B-787  done
B-788  done
B-789  done
B-790  done
error in  B-791




B-792  done
B-793  done
B-794  done
B-795  done
B-796  done
B-797  done
B-798  done
B-799  done
B-800  done
B-801  done
B-802  done
B-803  done
B-804  done
B-805  done
B-806  done
B-807  done
B-808  done
B-809  done
B-810  done
B-811  done
B-812  done
B-813  done
error in  B-814
B-815  done
B-816  done
B-817  done
B-818  done
B-819  done
B-820  done
B-821  done
B-822  done
B-823  done
B-824  done
B-825  done
B-826  done
error in  B-827
error in  B-828




B-829  done
B-830  done
B-831  done
B-832  done
B-833  done
B-834  done
B-835  done
B-836  done
error in  B-837
B-838  done


RDKit ERROR: [21:07:36] UFFTYPER: Unrecognized charge state for atom: 1


B-839  done
B-840  done
B-841  done
B-842  done
B-843  done
B-844  done
B-845  done
B-846  done
B-847  done
B-848  done
B-849  done
B-850  done
error in  B-851
B-852  done
B-853  done
B-854  done
B-884  done
B-885  done
B-886  done
B-887  done
B-888  done
B-889  done
B-890  done
B-891  done
B-892  done
B-893  done
B-894  done
B-895  done
B-896  done
B-897  done
B-898  done
B-899  done
error in  B-900
B-901  done
B-902  done
B-903  done
B-904  done
B-905  done
B-906  done
B-907  done
B-908  done
B-909  done
B-910  done
B-911  done
B-912  done
B-913  done
B-914  done
B-915  done
B-916  done
B-917  done
B-918  done
B-919  done
B-920  done
B-921  done
B-922  done
B-923  done
B-924  done
B-925  done
error in  B-926
B-927  done


RDKit ERROR: [21:11:41] UFFTYPER: Unrecognized charge state for atom: 1


B-928  done
B-929  done
B-930  done
B-931  done
B-932  done
B-933  done
B-934  done
B-935  done
B-936  done
B-937  done
B-938  done
B-939  done
B-940  done
B-941  done
B-942  done
B-943  done
B-944  done
B-945  done
B-946  done
B-947  done
B-948  done
B-949  done
B-950  done
B-951  done
B-952  done
B-953  done
B-954  done
B-955  done
B-956  done
B-957  done
error in  B-958
B-959  done
B-960  done
B-961  done
B-962  done
B-963  done
B-964  done
B-965  done
B-966  done
B-967  done
B-968  done
B-969  done
B-970  done
B-971  done
B-972  done
B-973  done
B-974  done
B-975  done
B-976  done
B-977  done
B-978  done
B-979  done
B-980  done
B-981  done
B-982  done
B-983  done
B-984  done
B-985  done
B-986  done
B-987  done
B-988  done
B-989  done
B-990  done
B-991  done
B-992  done
error in  B-993
B-994  done
B-995  done
B-996  done
B-997  done
B-998  done
B-999  done
B-1000  done
B-1001  done
B-1002  done
B-1003  done
B-1004  done
B-1005  done
B-1006  done
B-1007  done
B-1008  done
B-1009  don

RDKit ERROR: [21:11:41] Cannot write molecules with no conformers to XYZ block
RDKit ERROR: [21:12:48] Cannot write molecules with no conformers to XYZ block
RDKit ERROR: [21:16:15] Cannot write molecules with no conformers to XYZ block
RDKit ERROR: [21:21:06] UFFTYPER: Unrecognized atom type: Cu3 (0)


B-1079  done
B-1080  done
B-1081  done
B-1082  done
B-1083  done
B-1084  done
B-1085  done
B-1086  done
B-1087  done
B-1088  done
B-1089  done
B-1090  done
B-1091  done
B-1092  done
B-1093  done
error in  B-1094




B-1095  done
B-1096  done
B-1097  done
B-1098  done
B-1099  done
B-1100  done
B-1101  done
B-1102  done
B-1103  done
B-1104  done
B-1105  done
B-1106  done
B-1107  done
B-1108  done
B-1109  done
B-1110  done
B-1111  done
B-1112  done
B-1113  done
B-1114  done
B-1115  done
B-1116  done
B-1117  done
B-1118  done
B-1119  done
B-1120  done
B-1121  done
B-1122  done
B-1123  done
B-1124  done
B-1125  done
B-1126  done
B-1127  done
B-1128  done
error in  B-1129


RDKit ERROR: [21:22:45] UFFTYPER: Unrecognized atom type: Ba (0)


B-1130  done
B-1131  done
B-1132  done
B-1133  done
B-1134  done
B-1135  done
B-1136  done
B-1137  done
B-1138  done
B-1139  done
B-1140  done
B-1141  done
B-1142  done
B-1143  done
error in  B-1144


RDKit ERROR: [21:22:54] UFFTYPER: Unrecognized atom type: Ba (0)


B-1145  done
B-1146  done
error in  B-1147


RDKit ERROR: [21:22:56] UFFTYPER: Unrecognized atom type: Sr (0)


B-1148  done
B-1149  done
B-1150  done
B-1151  done
B-1152  done
B-1153  done
B-1154  done
B-1155  done
B-1156  done
B-1157  done
B-1158  done
B-1159  done
B-1160  done
B-1161  done
B-1162  done
B-1163  done
B-1164  done
B-1165  done
B-1166  done
B-1167  done
B-1168  done
B-1169  done
B-1170  done
B-1171  done
B-1172  done
B-1173  done
B-1174  done
B-1175  done
B-1176  done
B-1177  done


RDKit ERROR: [21:25:08] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [21:25:08] UFFTYPER: Unrecognized atom type: Zn+2 (0)


B-1178  done
B-1179  done
B-1180  done
B-1181  done
B-1182  done
B-1183  done
B-1184  done
B-1185  done
error in  B-1186


RDKit ERROR: [21:25:30] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [21:25:30] UFFTYPER: Unrecognized atom type: Zn+2 (0)


B-1187  done
error in  B-1188
B-1189  done


RDKit ERROR: [21:25:30] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [21:25:30] UFFTYPER: Unrecognized atom type: Zn+2 (0)


B-1190  done
B-1191  done
B-1192  done
B-1193  done
B-1194  done
B-1195  done
B-1196  done
B-1197  done
B-1198  done
B-1199  done
B-1200  done
B-1201  done
B-1202  done
B-1203  done
B-1204  done
B-1205  done
B-1206  done
B-1207  done
B-1208  done
B-1209  done
B-1210  done
B-1211  done
B-1212  done
B-1213  done
B-1214  done
B-1215  done
B-1216  done
B-1217  done
B-1218  done
B-1219  done
B-1220  done
B-1221  done
B-1222  done
B-1223  done
B-1224  done
B-1225  done
B-1226  done
B-1227  done
B-1228  done
B-1229  done
B-1230  done
B-1231  done
B-1232  done
error in  B-1233
B-1234  done
B-1235  done
B-1236  done
B-1237  done
B-1238  done
B-1239  done
B-1240  done
B-1241  done
B-1242  done
B-1243  done
B-1244  done
B-1245  done
B-1246  done
B-1247  done
B-1248  done
B-1249  done
B-1250  done
B-1251  done
B-1252  done
B-1253  done
B-1254  done
B-1255  done
B-1256  done
B-1257  done
B-1258  done
B-1259  done
B-1260  done
B-1261  done
error in  B-1262
B-1263  done
B-1264  done
B-1265  done
B-12

RDKit ERROR: [21:37:31] UFFTYPER: Unrecognized atom type: Ca (0)


B-1515  done
B-1516  done
B-1517  done
B-1518  done
B-1519  done
B-1520  done
B-1521  done
B-1522  done
B-1523  done
B-1524  done
B-1525  done
B-1526  done
B-1527  done
B-1528  done
B-1529  done
B-1530  done
error in  B-1531
B-1532  done
B-1533  done
B-1534  done
B-1535  done
B-1536  done
B-1537  done
B-1538  done
B-1539  done
B-1540  done
B-1541  done
B-1542  done
B-1543  done
B-1544  done
B-1545  done
B-1546  done
B-1547  done
B-1548  done
B-1549  done
B-1550  done
B-1551  done
B-1552  done
B-1553  done
B-1554  done
B-1555  done
B-1556  done
B-1557  done
B-1558  done
B-1559  done
B-1560  done
B-1561  done
B-1562  done
B-1563  done
B-1564  done
B-1565  done
B-1566  done
B-1567  done
B-1568  done
B-1569  done
B-1570  done
B-1571  done
B-1572  done
B-1573  done
B-1574  done
B-1575  done
B-1576  done
B-1577  done
B-1578  done
B-1579  done
B-1580  done
B-1581  done
B-1582  done
B-1583  done
B-1584  done
B-1585  done
B-1586  done
B-1587  done
B-1588  done
B-1589  done
error in  B-1590




B-1591  done
B-1592  done
B-1593  done
B-1594  done
B-1595  done
B-1596  done
B-1597  done
B-1598  done
B-1599  done
B-1600  done
B-1601  done
B-1602  done
B-1603  done
B-1604  done
B-1605  done
B-1606  done
B-1607  done
B-1608  done
B-1609  done
B-1610  done
B-1611  done
B-1612  done
B-1613  done
B-1614  done
B-1615  done
B-1616  done
B-1617  done
B-1618  done
B-1619  done
B-1620  done
B-1621  done
B-1622  done
B-1623  done
B-1624  done
B-1625  done
B-1626  done
B-1627  done
B-1628  done
B-1629  done
B-1630  done
B-1631  done
B-1632  done
B-1633  done
B-1634  done
B-1635  done
B-1636  done
B-1637  done
B-1638  done
B-1639  done
B-1640  done
B-1641  done
B-1642  done
B-1643  done
B-1644  done
B-1645  done
B-1646  done
B-1647  done
B-1648  done
B-1649  done
B-1650  done
B-1651  done
B-1652  done
B-1653  done
B-1654  done
B-1655  done
B-1656  done
B-1657  done
B-1658  done
B-1659  done
B-1660  done
B-1661  done
B-1662  done
B-1663  done
B-1664  done
B-1665  done
B-1666  done
B-1667  done

RDKit ERROR: [21:48:32] UFFTYPER: Unrecognized charge state for atom: 5


B-1723  done
B-1724  done
B-1725  done
B-1726  done
B-1727  done
B-1728  done
B-1729  done
B-1730  done
B-1731  done
B-1732  done
B-1733  done
B-1734  done
error in  B-1735




B-1736  done
B-1737  done
B-1738  done
B-1739  done
B-1740  done
B-1741  done
B-1742  done
B-1743  done
B-1744  done
B-1745  done
B-1746  done
B-1747  done
B-1748  done
B-1749  done
B-1750  done
B-1751  done
B-1752  done
B-1753  done
B-1754  done
B-1755  done
B-1756  done
B-1757  done
B-1758  done
B-1759  done
B-1760  done
B-1761  done
B-1762  done
B-1763  done
B-1764  done
B-1765  done
B-1766  done
B-1767  done
B-1768  done
B-1769  done
B-1770  done
B-1771  done
B-1772  done
B-1773  done
B-1774  done
B-1775  done
B-1776  done
B-1777  done
B-1778  done
B-1779  done
B-1780  done
B-1781  done
B-1782  done
B-1783  done
B-1784  done
B-1785  done
B-1786  done
B-1787  done
B-1788  done
B-1789  done
B-1790  done
B-1791  done
B-1792  done
B-1793  done
B-1794  done
B-1795  done
B-1796  done
B-1797  done
B-1798  done
B-1799  done
B-1800  done
B-1801  done
B-1802  done
B-1803  done
B-1804  done
B-1805  done
B-1806  done
B-1807  done
B-1808  done


RDKit ERROR: [21:52:40] UFFTYPER: Unrecognized charge state for atom: 1


B-1809  done
B-1810  done
B-1811  done
B-1812  done
B-1813  done
B-1814  done
B-1815  done
B-1816  done
B-1817  done
B-1818  done
B-1819  done
B-1820  done
B-1821  done
B-1822  done
B-1823  done
B-1824  done
B-1825  done
B-1826  done
B-1827  done
B-1828  done
B-1829  done
B-1830  done
B-1831  done
B-1832  done
B-1833  done
B-1834  done
B-1835  done
B-1836  done
B-1837  done
B-1838  done
B-1839  done
B-1840  done
B-1841  done
B-1842  done
B-1843  done
B-1844  done
B-1845  done
B-1846  done
B-1847  done
B-1848  done
B-1849  done
B-1850  done
B-1851  done
B-1852  done
B-1853  done
B-1854  done
B-1855  done
B-1856  done
B-1857  done
B-1858  done
B-1859  done
B-1860  done
B-1861  done
B-1862  done
B-1863  done
B-1864  done
B-1865  done
B-1866  done
B-1867  done
B-1868  done
B-1869  done
B-1870  done
B-1871  done
B-1872  done
B-1873  done
B-1874  done
B-1875  done
B-1876  done
B-1877  done
B-1878  done
B-1879  done
B-1880  done
B-1881  done
B-1882  done
B-1883  done
B-1884  done
B-1885  done

RDKit ERROR: [22:03:43] UFFTYPER: Unrecognized atom type: Se2+2 (8)


B-1998  done
B-1999  done
B-2000  done
B-2001  done
B-2002  done
B-2003  done
B-2004  done
B-2005  done
B-2006  done
B-2007  done
B-2008  done
B-2009  done
B-2010  done
B-2011  done
B-2012  done
B-2013  done
B-2014  done
B-2015  done
B-2016  done
B-2017  done
B-2018  done
B-2019  done
B-2020  done
B-2021  done
B-2022  done
B-2023  done
B-2024  done
B-2025  done
B-2026  done
B-2027  done
B-2028  done
B-2029  done
B-2030  done
B-2031  done
B-2032  done
B-2033  done
B-2034  done
B-2035  done
B-2036  done
B-2037  done
B-2038  done
B-2039  done
B-2040  done
B-2041  done
B-2042  done
B-2043  done
B-2044  done
B-2045  done
B-2046  done
B-2047  done
B-2048  done
B-2049  done
B-2050  done
B-2051  done
B-2052  done
B-2053  done
B-2054  done
B-2055  done
B-2056  done
B-2057  done
B-2058  done
B-2059  done
B-2060  done
B-2061  done
B-2062  done
B-2063  done
B-2064  done
B-2065  done
B-2066  done
B-2067  done
B-2068  done
B-2069  done
B-2070  done
B-2071  done
B-2072  done
B-2073  done
B-2074  done



B-2323  done
B-2324  done
B-2325  done
B-2326  done
B-2327  done
B-2328  done
B-2329  done
error in  B-2330
B-2331  done
error in  B-2332


RDKit ERROR: [22:22:32] UFFTYPER: Unrecognized atom type: Cu3 (0)
RDKit ERROR: [22:22:32] UFFTYPER: Unrecognized charge state for atom: 1


B-2333  done
B-2334  done
B-2335  done
B-2336  done
B-2337  done
B-2338  done
B-2339  done
B-2340  done
B-2341  done
B-2342  done
B-2343  done
B-2344  done
B-2345  done
B-2346  done
B-2347  done
B-2348  done
B-2349  done
B-2350  done
B-2351  done
B-2352  done
B-2353  done
B-2354  done
B-2355  done
B-2356  done
B-2357  done
B-2358  done
B-2359  done
B-2360  done
B-2361  done
B-2362  done
B-2363  done
B-2364  done
B-2365  done
B-2366  done
B-2367  done
B-2368  done
B-2369  done
error in  B-2370


RDKit ERROR: [22:29:38] UFFTYPER: Unrecognized atom type: Mn3 (0)


B-2371  done
B-2372  done


RDKit ERROR: [22:30:31] UFFTYPER: Unrecognized atom type: Ge2 (5)
RDKit ERROR: [22:30:31] UFFTYPER: Unrecognized atom type: Ge2 (8)


B-2373  done
B-2374  done
B-2375  done
B-2376  done
B-2377  done
B-2378  done
B-2379  done
B-2380  done
B-2381  done
B-2382  done
B-2383  done
B-2384  done
B-2385  done
B-2386  done
B-2387  done
B-2388  done
B-2389  done
B-2390  done
B-2391  done
B-2392  done
B-2393  done
B-2394  done
B-2395  done
B-2396  done
B-2397  done
B-2398  done
B-2399  done
B-2400  done
B-2401  done
B-2402  done
B-2403  done
B-2404  done
B-2405  done
B-2406  done
B-2407  done
B-2408  done
B-2409  done
B-2410  done
B-2411  done
B-2412  done
B-2413  done
B-2414  done
B-2415  done
B-2416  done
B-2417  done
B-2418  done
B-2419  done
B-2420  done
B-2421  done
B-2422  done
B-2423  done
B-2424  done
B-2425  done
B-2426  done
B-2427  done
B-2428  done
B-2429  done
B-2430  done
B-2431  done
B-2432  done
B-2433  done
B-2434  done
B-2435  done
B-2436  done
B-2437  done
B-2438  done
B-2439  done
B-2440  done
B-2441  done
B-2442  done
B-2443  done
B-2444  done
B-2445  done
B-2446  done
B-2447  done
B-2448  done
B-2449  done

RDKit ERROR: [22:39:48] UFFTYPER: Unrecognized atom type: Pt3 (0)


B-2464  done
B-2465  done
B-2466  done
B-2467  done
B-2468  done
B-2469  done
B-2470  done


RDKit ERROR: [22:40:07] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [22:40:07] UFFTYPER: Unrecognized atom type: Zn+2 (0)


B-2471  done
B-2472  done
B-2473  done
B-2474  done
B-2475  done
B-2476  done
B-2477  done
B-2478  done
B-2479  done
B-2480  done
B-2481  done
B-2482  done
B-2483  done
B-2484  done
B-2485  done
B-2486  done
B-2487  done
B-2488  done
B-2489  done
B-2490  done
B-2491  done
B-2492  done
B-2493  done
B-2494  done
B-2495  done
B-2496  done
B-2497  done
B-2498  done
B-2499  done
B-2500  done
B-2501  done
B-2502  done
B-2503  done
B-2504  done
B-2505  done
B-2506  done
B-2507  done
B-2508  done
B-2509  done
B-2510  done
B-2511  done
B-2512  done
B-2513  done
B-2514  done
B-2515  done
B-2516  done
B-2517  done
B-2518  done
B-2519  done
B-2520  done
B-2521  done
B-2522  done
B-2523  done
B-2524  done
B-2525  done
B-2526  done
B-2527  done
B-2528  done
B-2529  done
B-2530  done
B-2531  done
B-2532  done
B-2533  done
B-2534  done
B-2535  done
B-2536  done
B-2537  done
B-2538  done
B-2539  done
B-2540  done
B-2541  done
B-2542  done
B-2543  done
B-2544  done
B-2545  done
B-2546  done
B-2547  done



B-2631  done
B-2632  done
B-2633  done
B-2634  done
B-2635  done
B-2636  done
B-2637  done
B-2638  done
B-2639  done
B-2640  done
B-2641  done
B-2642  done
B-2643  done
B-2644  done
B-2645  done
B-2646  done
B-2647  done
B-2648  done
B-2649  done
B-2650  done
B-2651  done
B-2652  done
B-2653  done
B-2654  done
B-2655  done
B-2656  done
B-2657  done
B-2658  done
B-2659  done
B-2660  done
B-2661  done
B-2662  done
B-2663  done
B-2664  done
B-2665  done
B-2666  done
B-2667  done
B-2668  done
B-2669  done
B-2670  done
B-2671  done
B-2672  done
B-2673  done
B-2674  done
B-2675  done
B-2676  done
B-2677  done
B-2678  done
B-2679  done
B-2680  done
B-2681  done
B-2682  done
B-2683  done
B-2684  done
B-2685  done
B-2686  done
B-2687  done
B-2688  done
error in  B-2689
B-2690  done
B-2691  done
B-2692  done
B-2693  done
B-2694  done
B-2695  done
B-2696  done
B-2697  done
B-2698  done
B-2699  done
B-2700  done
B-2701  done
B-2702  done
B-2703  done
B-2704  done
B-2705  done
B-2706  done
B-2707  



B-3172  done
B-3173  done
B-3174  done
B-3175  done
B-3176  done
B-3177  done
B-3178  done
B-3179  done
B-3180  done
B-3181  done
B-3182  done
B-3183  done
B-3184  done
B-3185  done
B-3186  done
B-3187  done
B-3188  done
B-3189  done
B-3190  done
B-3191  done
B-3192  done
B-3193  done
B-3194  done
B-3195  done
B-3196  done
B-3197  done
B-3198  done
B-3199  done
B-3200  done
B-3201  done
B-3202  done
B-3203  done
B-3204  done
B-3205  done
B-3206  done
B-3207  done
B-3208  done
B-3209  done
B-3210  done
B-3211  done
B-3212  done
B-3213  done
B-3214  done
B-3215  done
B-3216  done
B-3217  done
B-3218  done
B-3219  done
B-3220  done
B-3221  done
B-3222  done
B-3223  done
B-3224  done
B-3225  done
B-3226  done
B-3227  done
B-3228  done
B-3229  done
B-3230  done
B-3231  done
B-3232  done
B-3240  done
B-3241  done
B-3242  done
B-3243  done
B-3244  done
B-3245  done
B-3246  done
B-3247  done
B-3248  done
B-3249  done
B-3250  done
B-3251  done
B-3252  done
B-3253  done
B-3254  done
B-3255  done

RDKit ERROR: [01:03:14] UFFTYPER: Unrecognized charge state for atom: 4


B-4352  done
error in  B-4353
B-4354  done
B-4355  done
B-4356  done


RDKit ERROR: [01:03:17] UFFTYPER: Unrecognized charge state for atom: 0


B-4357  done
B-4358  done
B-4359  done
B-4360  done
B-4361  done
B-4362  done
B-4363  done
B-4364  done
B-4365  done
B-4366  done
B-4367  done
B-4368  done
B-4369  done
B-4370  done
B-4371  done
B-4372  done
B-4373  done
B-4374  done
B-4375  done
B-4376  done
B-4377  done
B-4378  done
B-4379  done
B-4380  done
B-4381  done
B-4382  done
B-4383  done
B-4384  done
B-4385  done
B-4386  done
B-4387  done
B-4388  done
B-4389  done
B-4390  done
B-4391  done
B-4392  done
B-4393  done
B-4394  done
B-4395  done
B-4396  done
B-4397  done
B-4398  done
B-4399  done
B-4400  done
B-4401  done
B-4402  done
B-4403  done
B-4404  done
B-4405  done
B-4406  done
B-4407  done
B-4408  done
B-4409  done
B-4410  done
B-4411  done
B-4412  done
B-4413  done
B-4414  done
B-4415  done
B-4416  done
B-4417  done
B-4418  done
B-4419  done
B-4420  done
B-4421  done
B-4422  done
B-4423  done
B-4424  done
B-4425  done
B-4426  done
B-4427  done
B-4428  done
B-4429  done
B-4430  done
B-4431  done
B-4432  done
B-4433  done

In [11]:
# remove molecules that did not work

In [9]:
# Positions in smiles_list (not dataset IDs) whose SMILES was recorded as failed.
# A set makes each membership test O(1) instead of scanning the failures list.
failed_smiles = set(failures)
id_to_remove = [i for i, smile in enumerate(smiles_list) if smile in failed_smiles]

new_smiles_list = np.delete(smiles_list, id_to_remove)

# Failures are matched by SMILES string, so a SMILES that occurs on several rows
# is dropped from every row. If one of those rows actually succeeded, pse_list
# holds an energy with no surviving SMILES and the lengths differ.
assert len(new_smiles_list) == len(pse_list), (
    'SMILES and energies are misaligned: %d SMILES kept but %d energies stored'
    % (len(new_smiles_list), len(pse_list))
)

In [12]:
# Double check list lengths: there must be one energy per remaining SMILES,
# so the two numbers printed below have to be equal
print(len(pse_list))
print(len(new_smiles_list))

4609
4609


In [19]:
# convert to dataframe
def make_dataframe(new_smiles_list, pse_list,
                   path='../../Data/Energy/EnergyDataset-B.csv'):
    """Pair each SMILES with its energy and write the table to a CSV file.

    Args:
        new_smiles_list: sequence of SMILES strings, one per row.
        pse_list: sequence of energies, in the same order as the SMILES.
        path: destination CSV file. Defaults to the location this notebook
            has always written to.

    Raises:
        ValueError: if the two sequences do not have the same length.

    The frame is written with its default integer index as the first column,
    and its shape is printed.
    """
    if len(new_smiles_list) != len(pse_list):
        raise ValueError(
            'cannot pair %d SMILES with %d energies'
            % (len(new_smiles_list), len(pse_list))
        )

    descriptor_data = {'Smiles': new_smiles_list, 'Energy': pse_list}
    df = pd.DataFrame(data=descriptor_data)
    df.to_csv(path)
    print ('shape: ', df.shape)

In [20]:
# Write the SMILES/energy table for the molecules that were processed successfully
make_dataframe(new_smiles_list, pse_list)

shape:  (4609, 2)
