In [3]:
import pandas as pd
import rdkit
from rdkit import Chem

In [1]:
from multiprocessing import Pool
import numpy as np

def process_smiles_batch(smiles_batch):
    """Convert one batch of SMILES strings to InChI strings.

    Args:
        smiles_batch: Iterable of SMILES entries. Entries that are not ``str``
            (e.g. ``None`` or the float NaN pandas uses for missing values)
            are skipped.

    Returns:
        list[str]: InChI strings, in input order, for the entries that could
        be converted. Skipped entries leave no placeholder, so the result can
        be shorter than the input.
    """
    inchies = []
    for smiles in smiles_batch:
        # The SMILES parser only accepts strings; skipping anything else keeps
        # one missing value from aborting the whole batch.
        if not isinstance(smiles, str):
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: the parser returned no molecule.
            continue
        inchi = Chem.inchi.MolToInchi(mol)
        # NOTE(review): an empty string is treated as a failed conversion
        # (presumably what RDKit returns when InChI generation fails) -- confirm.
        if inchi:
            inchies.append(inchi)
    return inchies

def parallel_process(smiles_series, n_processes=4, batch_size=1000):
    """Convert SMILES to InChI using a pool of worker processes.

    The input is cut into consecutive slices of ``batch_size`` entries, each
    slice is handed to ``process_smiles_batch`` in a worker, and the per-slice
    results are concatenated in slice order.

    Args:
        smiles_series: Sized, sliceable collection of SMILES entries.
        n_processes: Number of worker processes in the pool.
        batch_size: Number of entries per slice.

    Returns:
        list: Concatenation of the lists returned for each slice.
    """
    total = len(smiles_series)
    batches = []
    for start in range(0, total, batch_size):
        batches.append(smiles_series[start:start + batch_size])

    with Pool(n_processes) as pool:
        per_batch = pool.map(process_smiles_batch, batches)

    # Flatten the per-slice lists into a single list.
    inchies = []
    for batch_result in per_batch:
        inchies.extend(batch_result)
    return inchies

In [2]:
import pandas as pd
from rdkit import Chem
import gc

def process_smiles_to_inchi_chunks(smiles_series, chunk_size=1000):
    """Convert SMILES to InChI chunk by chunk, skipping unconvertible entries.

    Args:
        smiles_series: SMILES entries as a pandas Series or any other sized,
            sliceable sequence (e.g. a list). Entries that are not ``str``
            (missing values such as ``None``/NaN) are skipped.
        chunk_size: Number of entries handled per chunk; must be >= 1.

    Returns:
        list[str]: InChI strings in input order. Skipped entries leave no
        placeholder, so the list can be shorter than the input and is not
        index-aligned with it.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make the loop below silently return nothing.
        raise ValueError("chunk_size must be a positive integer")

    total = len(smiles_series)
    # A Series must be sliced positionally through .iloc; plain sequences
    # (lists, arrays) already slice by position.
    positional = getattr(smiles_series, "iloc", smiles_series)
    inchies = []

    for start in range(0, total, chunk_size):
        # Molecules are converted one at a time, so at most one parsed molecule
        # is alive at any moment and no explicit garbage collection is needed.
        for smiles in positional[start:start + chunk_size]:
            # The SMILES parser only accepts strings; skip missing values
            # instead of letting them abort the whole run.
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparseable SMILES: the parser returned no molecule.
                continue
            inchi = Chem.inchi.MolToInchi(mol)
            # NOTE(review): an empty string is treated as a failed conversion
            # (presumably what RDKit returns when InChI generation fails) -- confirm.
            if inchi:
                inchies.append(inchi)

        # Report every tenth chunk, counting the entries handled so far
        # (the end of the chunk just finished, not its start offset).
        if start % (chunk_size * 10) == 0:
            print(f"Traité {min(start + chunk_size, total)}/{total} SMILES")

    return inchies


In [3]:

# Usage: read the SMILES column of the generated canonical-SMILES CSV and
# convert it to InChI; SMILES that fail to parse are dropped by the helper,
# so the resulting list can be shorter than the input column.
moses_can_cl = pd.read_csv("../data/generated/moses_canonical_CL_1.csv")["SMILES"]
moses_inchiescan_cl = process_smiles_to_inchi_chunks(moses_can_cl)

[16:27:32] SMILES Parse Error: extra close parentheses while parsing: Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1
[16:27:32] SMILES Parse Error: check for mistakes around position 31:
[16:27:32] C(=O)NC(C)(C#N)C1CC1)c1ccccc1
[16:27:32] ~~~~~~~~~~~~~~~~~~~~^
[16:27:32] SMILES Parse Error: Failed parsing SMILES 'Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1' for input: 'Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1'
[16:27:32] SMILES Parse Error: extra close parentheses while parsing: CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1
[16:27:32] SMILES Parse Error: check for mistakes around position 39:
[16:27:32] c1cccc(N2CCCCC2=O)c1)C1CC1
[16:27:32] ~~~~~~~~~~~~~~~~~~~~^
[16:27:32] SMILES Parse Error: Failed parsing SMILES 'CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1' for input: 'CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1'
[16:27:32] SMILES Parse Error: extra close parentheses while parsing: Cc1ccccc1OCC(=O)NCCCn1cccc1)c1ccccn1
[16:27:32] SMILES Parse Error: check for mistakes around position 28:
[16:27:

Traité 0/10000 SMILES


[16:27:33] SMILES Parse Error: extra close parentheses while parsing: C=CCNC(=O)Cn1nc(-c2cccs2)oc1=O)c1cccc(C)c1
[16:27:33] SMILES Parse Error: check for mistakes around position 31:
[16:27:33] Cn1nc(-c2cccs2)oc1=O)c1cccc(C)c1
[16:27:33] ~~~~~~~~~~~~~~~~~~~~^
[16:27:33] SMILES Parse Error: Failed parsing SMILES 'C=CCNC(=O)Cn1nc(-c2cccs2)oc1=O)c1cccc(C)c1' for input: 'C=CCNC(=O)Cn1nc(-c2cccs2)oc1=O)c1cccc(C)c1'
[16:27:33] SMILES Parse Error: syntax error while parsing: Cc1nnnn1-c1cc((NC(=O)c2ccc(C)[nH]c2=O)ccc1F)c1
[16:27:33] SMILES Parse Error: check for mistakes around position 15:
[16:27:33] Cc1nnnn1-c1cc((NC(=O)c2ccc(C)[nH]c2=O)ccc
[16:27:33] ~~~~~~~~~~~~~~^
[16:27:33] SMILES Parse Error: Failed parsing SMILES 'Cc1nnnn1-c1cc((NC(=O)c2ccc(C)[nH]c2=O)ccc1F)c1' for input: 'Cc1nnnn1-c1cc((NC(=O)c2ccc(C)[nH]c2=O)ccc1F)c1'
[16:27:33] SMILES Parse Error: extra close parentheses while parsing: CC(C(=O)Nc1ccccc1)N1C(=O)c2cccnc2S1(=O)=O)CC1
[16:27:33] SMILES Parse Error: check for mistakes ar

In [4]:

# SMILES column of the generated ClearSMILES sample.
moses_clearsmi_cl = pd.read_csv("../data/generated/moses_ClearSMILES_CL_1.csv")["SMILES"]

# SMILES of the reference rows whose SPLIT column equals 'train'; selected in
# a single step so the name is only ever bound to the final Series.
moses_training_data = pd.read_csv("../data/training_data/moses_canonical.csv").loc[
    lambda frame: frame["SPLIT"] == 'train', "SMILES"
]

In [5]:
# Preview the training-split SMILES Series (rich display of the cell's last expression).
moses_training_data

0            CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1              CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
3               Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4                  Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
5                    CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O
                             ...                   
1936957                        N#Cc1c(Br)cnc(N)c1Br
1936958          COC(=O)c1cc(CNC(=O)OC(C)(C)C)ccc1C
1936959                      NC(=O)c1ccc2ccccc2c1Br
1936960    CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1
1936961     CC(NC(=O)OC(C)(C)C)c1nc(CO)nn1Cc1ccccc1
Name: SMILES, Length: 1584663, dtype: object

In [5]:

# Convert the generated ClearSMILES sample to InChI; unparseable SMILES are dropped.
moses_inchies_clearsmi_cl = process_smiles_to_inchi_chunks(moses_clearsmi_cl)



[16:32:14] SMILES Parse Error: extra close parentheses while parsing: CC1SC=C(N=1)C(=O)NC(C)C1C=NN(C)C=1C)C1C=CC(F)=CC=1
[16:32:14] SMILES Parse Error: check for mistakes around position 36:
[16:32:14] O)NC(C)C1C=NN(C)C=1C)C1C=CC(F)=CC=1
[16:32:14] ~~~~~~~~~~~~~~~~~~~~^
[16:32:14] SMILES Parse Error: Failed parsing SMILES 'CC1SC=C(N=1)C(=O)NC(C)C1C=NN(C)C=1C)C1C=CC(F)=CC=1' for input: 'CC1SC=C(N=1)C(=O)NC(C)C1C=NN(C)C=1C)C1C=CC(F)=CC=1'
[16:32:14] SMILES Parse Error: extra close parentheses while parsing: C1N(C)N=CC=1CNC(=O)NC1=CN=CC=C1C)N1CCCC1
[16:32:14] SMILES Parse Error: check for mistakes around position 33:
[16:32:14] CNC(=O)NC1=CN=CC=C1C)N1CCCC1
[16:32:14] ~~~~~~~~~~~~~~~~~~~~^
[16:32:14] SMILES Parse Error: Failed parsing SMILES 'C1N(C)N=CC=1CNC(=O)NC1=CN=CC=C1C)N1CCCC1' for input: 'C1N(C)N=CC=1CNC(=O)NC1=CN=CC=C1C)N1CCCC1'
[16:32:14] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[16:32:14] SMILES Parse Error: extra close parentheses while parsing: N1=NN=CN1C

Traité 0/10000 SMILES


[16:32:14] SMILES Parse Error: extra close parentheses while parsing: F1C=CC=CC=1CN1CCSC2C1=CC(N)=CC=2)=O
[16:32:14] SMILES Parse Error: check for mistakes around position 33:
[16:32:14] N1CCSC2C1=CC(N)=CC=2)=O
[16:32:14] ~~~~~~~~~~~~~~~~~~~~^
[16:32:14] SMILES Parse Error: Failed parsing SMILES 'F1C=CC=CC=1CN1CCSC2C1=CC(N)=CC=2)=O' for input: 'F1C=CC=CC=1CN1CCSC2C1=CC(N)=CC=2)=O'
[16:32:14] SMILES Parse Error: extra close parentheses while parsing: CC1CCCC(C)N1C(=O)NC1=CC=CC=C1OC(F)C)C(OC)=O
[16:32:14] SMILES Parse Error: check for mistakes around position 36:
[16:32:14] O)NC1=CC=CC=C1OC(F)C)C(OC)=O
[16:32:14] ~~~~~~~~~~~~~~~~~~~~^
[16:32:14] SMILES Parse Error: Failed parsing SMILES 'CC1CCCC(C)N1C(=O)NC1=CC=CC=C1OC(F)C)C(OC)=O' for input: 'CC1CCCC(C)N1C(=O)NC1=CC=CC=C1OC(F)C)C(OC)=O'
[16:32:14] SMILES Parse Error: extra close parentheses while parsing: N1C=CC=C1CC(=O)NCC1=CC=CN=C1)OC1CCCC1
[16:32:14] SMILES Parse Error: check for mistakes around position 29:
[16:32:14] 1CC(=O)NCC1=CC

In [None]:

# Convert the training-split SMILES to InChI; this list is the reference the
# generated molecules are compared against further down.
moses_inchies_training = process_smiles_to_inchi_chunks(moses_training_data)

[16:34:35] SMILES Parse Error: extra close parentheses while parsing: Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1
[16:34:35] SMILES Parse Error: check for mistakes around position 31:
[16:34:35] C(=O)NC(C)(C#N)C1CC1)c1ccccc1
[16:34:35] ~~~~~~~~~~~~~~~~~~~~^
[16:34:35] SMILES Parse Error: Failed parsing SMILES 'Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1' for input: 'Cn1nnnc1SCC(=O)NC(C)(C#N)C1CC1)c1ccccc1'
[16:34:35] SMILES Parse Error: extra close parentheses while parsing: CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1
[16:34:35] SMILES Parse Error: check for mistakes around position 39:
[16:34:35] c1cccc(N2CCCCC2=O)c1)C1CC1
[16:34:35] ~~~~~~~~~~~~~~~~~~~~^
[16:34:35] SMILES Parse Error: Failed parsing SMILES 'CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1' for input: 'CS(=O)(=O)NCC(=O)Nc1cccc(N2CCCCC2=O)c1)C1CC1'
[16:34:35] SMILES Parse Error: extra close parentheses while parsing: Cc1ccccc1OCC(=O)NCCCn1cccc1)c1ccccn1
[16:34:35] SMILES Parse Error: check for mistakes around position 28:
[16:34:

done


[16:34:36] SMILES Parse Error: extra close parentheses while parsing: CNc1nn2c(=O)cc(C)[nH]c2=O)nc2s1
[16:34:36] SMILES Parse Error: check for mistakes around position 26:
[16:34:36] n2c(=O)cc(C)[nH]c2=O)nc2s1
[16:34:36] ~~~~~~~~~~~~~~~~~~~~^
[16:34:36] SMILES Parse Error: Failed parsing SMILES 'CNc1nn2c(=O)cc(C)[nH]c2=O)nc2s1' for input: 'CNc1nn2c(=O)cc(C)[nH]c2=O)nc2s1'
[16:34:36] SMILES Parse Error: extra close parentheses while parsing: CC(C)(C)NC(=O)NCC(C1CCOC1)N1CCOCC1)C(C)(C)C
[16:34:36] SMILES Parse Error: check for mistakes around position 35:
[16:34:36] NCC(C1CCOC1)N1CCOCC1)C(C)(C)C
[16:34:36] ~~~~~~~~~~~~~~~~~~~~^
[16:34:36] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)NC(=O)NCC(C1CCOC1)N1CCOCC1)C(C)(C)C' for input: 'CC(C)(C)NC(=O)NCC(C1CCOC1)N1CCOCC1)C(C)(C)C'
[16:34:36] SMILES Parse Error: extra close parentheses while parsing: Cn1nnnc1SCCCC(=O)Nc1cc(C2CC2)n[nH]1)C(F)(F)F
[16:34:36] SMILES Parse Error: check for mistakes around position 36:
[16:34:36] O)Nc1cc(C2CC2)n

done


In [15]:
# NOTE(review): `moses_molcan_cl` is not defined in any visible cell (hidden
# kernel state) -- presumably the parsed molecules for `moses_can_cl`; confirm.
len(moses_molcan_cl)

10000

In [17]:
# Count the InChI conversions over the truthy entries of `moses_molcan_cl`.
# NOTE(review): `moses_molcan_cl` is not defined in any visible cell -- confirm its origin.
len(list(map(Chem.inchi.MolToInchi, filter(None, moses_molcan_cl))))











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































9328

AttributeError: 'list' object has no attribute 'apply'

In [10]:
# Novelty filter: keep only the generated InChIs that do not occur in the
# training set. Membership is tested against a set, which gives the same
# result as scanning the training list for every generated molecule but in
# linear rather than quadratic time.
_training_inchi_lookup = set(moses_inchies_training)
moses_inchiescan_cl = [k for k in moses_inchiescan_cl if k not in _training_inchi_lookup]
# The ClearSMILES InChIs are produced under the name `moses_inchies_clearsmi_cl`;
# the earlier self-reference to `moses_inchiescleasmi_cl` read a name that no
# visible cell defines and fails on a fresh kernel.
moses_inchiescleasmi_cl = [k for k in moses_inchies_clearsmi_cl if k not in _training_inchi_lookup]

In [12]:
# Number of generated canonical-SMILES InChIs left after removing those found in the training set.
len(moses_inchiescan_cl)

0