In [4]:
import pandas as pd
from rdkit import Chem

# Função para canonizar SMILES
def canonize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
        else:
            return None
    except:
        return None

# Função para remover SMILES inválidos
def remove_invalid_smiles(df, smiles_col):
    invalid_smiles = df[df[smiles_col].apply(lambda x: canonize_smiles(x) is None)]
    invalid_smiles['Reason'] = 'Invalid SMILES'
    df = df[df[smiles_col].apply(lambda x: canonize_smiles(x) is not None)]
    return df, invalid_smiles

# Função para remover duplicatas discordantes
def remove_discordant_duplicates(df, smiles_col, activity_col):
    # Canonizar os SMILES
    df['canonical_smiles'] = df[smiles_col].apply(canonize_smiles)
    
    # Encontrar duplicatas discordantes
    duplicates = df.duplicated(subset=['canonical_smiles', activity_col], keep=False)
    discordant = df[duplicates].groupby('canonical_smiles')[activity_col].nunique() > 1
    discordant_smiles = discordant[discordant].index
    
    # Registrar duplicatas discordantes
    discordant_duplicates = df[df['canonical_smiles'].isin(discordant_smiles)]
    discordant_duplicates['Reason'] = 'Discordant Duplicate'
    
    # Registrar duplicatas concordantes
    concordant_duplicates = df[duplicates & ~df['canonical_smiles'].isin(discordant_smiles)]
    concordant_duplicates['Reason'] = 'Concordant Duplicate'
    
    # Remover duplicatas discordantes e concordantes
    df = df[~df['canonical_smiles'].isin(discordant_smiles)]
    df = df.drop_duplicates(subset=['canonical_smiles', activity_col], keep='first')
    
    # Concatenar todas as remoções
    removed_smiles = pd.concat([discordant_duplicates, concordant_duplicates])
    
    # Remover a coluna de SMILES canonizados
    df = df.drop(columns=['canonical_smiles'])
    
    return df, removed_smiles

# Função para criar o log
def create_log(removed_df, log_file='log.txt'):
    invalid_count = len(removed_df[removed_df['Reason'] == 'Invalid SMILES'])
    discordant_count = len(removed_df[removed_df['Reason'] == 'Discordant Duplicate'])
    concordant_count = len(removed_df[removed_df['Reason'] == 'Concordant Duplicate'])
    
    with open(log_file, 'w') as f:
        f.write(f"Invalid SMILES removed: {invalid_count}\n")
        f.write(f"Discordant duplicates removed: {discordant_count}\n")
        f.write(f"Concordant duplicates removed: {concordant_count}\n")

# Função principal para executar o processo completo
def process_smiles(df, smiles_col, activity_col, log_file='log.txt'):
    # Remover SMILES inválidos
    df, invalid_smiles = remove_invalid_smiles(df, smiles_col)
    
    # Remover duplicatas discordantes e concordantes
    df, removed_smiles = remove_discordant_duplicates(df, smiles_col, activity_col)
    
    # Concatenar todas as remoções
    removed_smiles = pd.concat([invalid_smiles, removed_smiles])
    
    # Criar o log
    create_log(removed_smiles, log_file)
    
    return df, removed_smiles



In [5]:
df = pd.read_excel(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Skin\DADOS\Binary\RAW_Skin_data_LLNA_Binary.xlsx")
df

Unnamed: 0,CAS No,Canonical SMILES,LLNA_EC3,binary,Source
0,18479-49-7,CC(C)CCCC(C)(O)C=C,,0,ECHA
1,93-84-5,[O-][N+](=O)c1ccc2NC(=O)Nc2c1,,0,ECHA
2,38725-13-2,CC(C)CCCCCCN(CCCCCCC(C)C)CCCCCCC(C)C,,1,ECHA
3,49701-24-8,CN[S](=O)(=O)c1cc(OC)c(N)cc1OC,,0,ECHA
4,3874-54-2,Fc1ccc(cc1)C(=O)CCCCl,,1,ECHA
...,...,...,...,...,...
3314,122-78-1,O=CCc1ccccc1,4.7,0,CosEU
3315,122-78-1,O=CCc1ccccc1,3.0,1,CosEU
3316,122-78-1,O=CCc1ccccc1,8.8,0,CosEU
3317,17369-59-4,CC/C=C/1OC(=O)c2ccccc12,3.7,0,CosEU


In [7]:

# Executar o processo completo
df_cleaned, removed_smiles = process_smiles(df, smiles_col='Canonical SMILES', activity_col='binary')

# Salvar os resultados
df_cleaned.to_csv('cleaned_smiles.csv', index=False)
removed_smiles.to_csv('removed_smiles.csv', index=False)

print("Process completed. Check 'log.txt' for details.")
print(df_cleaned)
print(removed_smiles)

[14:50:07] SMILES Parse Error: syntax error while parsing: O|[Zn]|O
[14:50:07] SMILES Parse Error: Failed parsing SMILES 'O|[Zn]|O' for input: 'O|[Zn]|O'
[14:50:07] SMILES Parse Error: syntax error while parsing: O|[Zn]|O
[14:50:07] SMILES Parse Error: Failed parsing SMILES 'O|[Zn]|O' for input: 'O|[Zn]|O'
[14:50:07] SMILES Parse Error: syntax error while parsing: O|[Co](|O)|O
[14:50:07] SMILES Parse Error: Failed parsing SMILES 'O|[Co](|O)|O' for input: 'O|[Co](|O)|O'
[14:50:07] SMILES Parse Error: syntax error while parsing: [Na+].[Na+].[Cu]|1|OC(=O)CN(CCN(CC(O|1)=O)CC([O-])=O)CC([O-])=O
[14:50:07] SMILES Parse Error: Failed parsing SMILES '[Na+].[Na+].[Cu]|1|OC(=O)CN(CCN(CC(O|1)=O)CC([O-])=O)CC([O-])=O' for input: '[Na+].[Na+].[Cu]|1|OC(=O)CN(CCN(CC(O|1)=O)CC([O-])=O)CC([O-])=O'
[14:50:07] SMILES Parse Error: syntax error while parsing: N.N.O|[Fe]|1|OC(=O)CN(CCN(CC(O)=O)CC(O)=O)CC(O|1)=O
[14:50:07] SMILES Parse Error: Failed parsing SMILES 'N.N.O|[Fe]|1|OC(=O)CN(CCN(CC(O)=O)CC(O)=O)

Process completed. Check 'log.txt' for details.
          CAS No                         Canonical SMILES  LLNA_EC3  binary  \
0     18479-49-7                       CC(C)CCCC(C)(O)C=C       NaN       0   
1        93-84-5            [O-][N+](=O)c1ccc2NC(=O)Nc2c1       NaN       0   
2     38725-13-2     CC(C)CCCCCCN(CCCCCCC(C)C)CCCCCCC(C)C       NaN       1   
3     49701-24-8           CN[S](=O)(=O)c1cc(OC)c(N)cc1OC       NaN       0   
4      3874-54-2                    Fc1ccc(cc1)C(=O)CCCCl       NaN       1   
...          ...                                      ...       ...     ...   
3297    103-50-4                    C(OCc1ccccc1)c2ccccc2      6.30       0   
3298   1335-66-6  CC1CC(=CC(C)C1C=O)C.CC2C=C(C)CC(C=O)C2C      7.35       0   
3299   5462-06-6                     COc1ccc(CC(C)C=O)cc1     23.63       0   
3302  54464-57-2        CC1CC2=C(CC1(C)C(C)=O)C(C)(C)CCC2     25.14       0   
3306   6658-48-6                 CC(C)Cc1ccc(CC(C)C=O)cc1      7.60       0   

   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['canonical_smiles'] = df[smiles_col].apply(canonize_smiles)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  discordant_duplicates['Reason'] = 'Discordant Duplicate'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concordant_duplicates['Reason'] = 'Concordant Duplicate'
