In [1]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem.QED import qed
from rdkit.Chem.Crippen import MolLogP

IPythonConsole.ipython_useSVG=True 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
import sys
from io import StringIO
import os.path

In [4]:
moses_canonical = pd.read_csv("../data/generated/moses_canonical.csv")
guacamol_canonical = pd.read_csv("../data/generated/guacamol_canonical.csv")

In [5]:
def replacing_spaces(chain_str):

    return chain_str.replace(" ", "")

In [6]:
moses_canonical["SMILES"] = moses_canonical["SMILES"].apply(replacing_spaces)

In [None]:
guacamol_canonical["SMILES"] =guacamol_canonical["SMILES"].apply(replacing_spaces)

In [7]:

error_dict={
'marked aromatic' : 'aromaticity',
'Unkekulized atoms' : 'aromaticity',
'syntax error' : 'syntax',
'duplicates bond' : 'ring',
'duplicated ring closure' : 'ring',
'unclosed ring': 'ring',
'extra close parentheses': 'parenthesis',
'extra open parentheses' : 'parenthesis',
'Explicit valence' : 'valence'
}

def error_log_classification(samples : str,error_dict : dict[str,str])-> str:
    """
    This function will take a SMILES that has already been checked for validity and will retrieve the error type.
    Args:
    samples (str): a SMILES string to be checked
    error_dict (dict): a dictionary containing the keywords and their corresponding error types
    Returns:
    error_type (str): the type of error found in the SMILES string
    """
    ### declare local variable
    error_type= "unknown"

    ### redirect error log
    sio = sys.stderr = StringIO()

    ### generate mol
    mol= Chem.MolFromSmiles(samples)

    ### chek if an error happened
    error_log =sio.getvalue()
    
    
    ### iterate through keywords to find error type
    for keyword in error_dict:
        if keyword in error_log:
            error_type= error_dict[keyword]
            
            break

    return error_type,mol

In [None]:
#pour chaque token, le mol associé
mol_from_moses_can = []
mol_from_guacamol_can = []
'mol_from_clearSmiles = []'
#pour chaque token, l'erreur associée
error_vocab_moses_can = []
error_vocab_guacamol_can = []
"""error_vocab_clearSmiles = []"""

for i in moses_canonical["SMILES"]:
    error,mol = error_log_classification(i,error_dict=error_dict)
    mol_from_moses_can.append(mol)
    error_vocab_moses_can.append(error)

for i in guacamol_canonical["SMILES"]:
    error,mol = error_log_classification(i,error_dict=error_dict)
    mol_from_guacamol_can.append(mol)
    error_vocab_guacamol_can.append(error)


"""
for i in clearSmiles_vocab:
    error,mol = error_log_classification(i,error_dict=error_dict)
    mol_from_clearSmiles.append(mol)
    error_vocab_clearSmiles.append(error)"""

C1CC(=O)N(Cc2ccccn2)C1Fc1cccnc1Oc1cccnc1F2CCC2O2O2C31OCC(C)C#N)NCC(F)(F)FFc1ccc(C2(COc1cccnc1Fc1cccc(F)c1C#N4CCOCC1CC1(Cl)Cl)N1CCCCCC1c1cccnc1Cc1cccc(F)c1NCC(O)COCC1CC1Cc1cccc(F)c1NC2=Oc1cccnc1Fc1cccnc1N1CCOCC1c1cccnc1Fc1cccc(Cl)c1C(C)=Oc1cccnc1N1CCCCC1=O)N1CCc2cc(F)ccc2C1=O)N(Cc1cccc(OC2CCC2OCC(C)(C)OC2CC1CN(S(=O)(=O)N(C)C)C3Oc1ncccc1Fc1ccc(C2=Cc1ccccc1O)N1CC2CCCCC2COCC1CN(C(=O)c1ccc[nH]1)Cc1cccnc1)N1CCC1COCC1CC1(Cl)ClCc1cccc(F)c1Fc1cccnc1-n1cncn1c1ccncc1Fc1ccc(C2CN(Cc3ccncc3)CC1c1cccc(F)c1)N1CCCC1c1nc(C2CC2)no1)N1CCC(n2cnccc1-c1ccncc1Fc1cccnc1OCC(C)CO)C2OCCCC21c1ccco1)N1CCCC2CCOc1cccnc1-n1cncn1c1cccc(Cl)c1-n1cncn12CCCC1CC1(O)CCCOc1cccnc1Fc1cccnc1-n1cncn1c1ccco1)N1CCCC1Cc1cccc(C)n1(Cl)Cl(Cl)Cl)N1CC2CC=C1CCc2ccccc2C1
Fc1cccc(F)c1Cl)N1CCCC1COCC1CC1Cc1ccn[nH]1Cc1cccc(C)n12(=O)=O2(=O)=O)N1CCOCC2Oc1cccnc1-n1cncn1c1ccncc1)N1CCOCC1Cc1ccc2c(c1)OCO22N=C1CCOC1=O)N1CC2CC=CCC(3)C2Cc1ccccc1Fc1ccncc1Cc1cccc(F)c1)N1CCCC1c1ccc2c(c1)OCCCO22Cc1ccccc1OCc1ccncc1)NCCCOc1ccccc1c1cccnc1Clc1ccncc1Brc1ccncc1N

[14:25:07] SMILES Parse Error: extra close parentheses while parsing: C1CC(=O)N(Cc2ccccn2)C1Fc1cccnc1Oc1cccnc1F2CCC2O2O2C31OCC(C)C#N)NCC(F)(F)FFc1ccc(C2(COc1cccnc1Fc1cccc(F)c1C#N4CCOCC1CC1(Cl)Cl)N1CCCCCC1c1cccnc1Cc1cccc(F)c1NCC(O)COCC1CC1Cc1cccc(F)c1NC2=Oc1cccnc1Fc1cccnc1N1CCOCC1c1cccnc1Fc1cccc(Cl)c1C(C)=Oc1cccnc1N1CCCCC1=O)N1CCc2cc(F)ccc2C1=O)N(Cc1cccc(OC2CCC2OCC(C)(C)OC2CC1CN(S(=O)(=O)N(C)C)C3Oc1ncccc1Fc1ccc(C2=Cc1ccccc1O)N1CC2CCCCC2COCC1CN(C(=O)c1ccc[nH]1)Cc1cccnc1)N1CCC1COCC1CC1(Cl)ClCc1cccc(F)c1Fc1cccnc1-n1cncn1c1ccncc1Fc1ccc(C2CN(Cc3ccncc3)CC1c1cccc(F)c1)N1CCCC1c1nc(C2CC2)no1)N1CCC(n2cnccc1-c1ccncc1Fc1cccnc1OCC(C)CO)C2OCCCC21c1ccco1)N1CCCC2CCOc1cccnc1-n1cncn1c1cccc(Cl)c1-n1cncn12CCCC1CC1(O)CCCOc1cccnc1Fc1cccnc1-n1cncn1c1ccco1)N1CCCC1Cc1cccc(C)n1(Cl)Cl(Cl)Cl)N1CC2CC=C1CCc2ccccc2C1
[14:25:07] SMILES Parse Error: Failed parsing SMILES 'C1CC(=O)N(Cc2ccccn2)C1Fc1cccnc1Oc1cccnc1F2CCC2O2O2C31OCC(C)C#N)NCC(F)(F)FFc1ccc(C2(COc1cccnc1Fc1cccc(F)c1C#N4CCOCC1CC1(Cl)Cl)N1CCCCCC1c1cccnc1Cc1cccc

KeyboardInterrupt: 

In [None]:
moses_canonical["mol"] = mol_from_moses_can
moses_canonical["error type if not mol"] = error_vocab_moses_can
error_type_moses_can = moses_canonical["error type if not mol"].unique()

ValueError: Length of values (24) does not match length of index (1000)

In [None]:
error_count_moses_can = [moses_canonical[moses_canonical["error type if not mol"] == k ]["error type if not mol"].count() for k in error_type_moses_can ]

In [None]:
plt.xlabel("error type ")
plt.ylabel("occurence")
plt.grid()
plt.bar(error_type_moses_can,error_count_moses_can)