In [35]:
import pandas as pd
import numpy as np
from rdkit import Chem

def convert_ug_ml_to_nM(ug_ml, molecular_weight):
    """
    Converta a concentração de µg/mL para nM/mL.
    
    Parâmetros:
    ug_ml (float): Concentração em µg/mL
    molecular_weight (float): Peso molecular em g/mol
    
    Retorna:
    float: Concentração em nM/mL
    """
    # Fator de conversão de µg para g
    conversion_factor = 1e-6
    
    # Converta µg/mL para g/mL
    g_ml = ug_ml * conversion_factor
    
    # Converta g/mL para molar (M) usando o peso molecular (g/mol)
    molar_concentration = g_ml / molecular_weight
    
    # Converta M para nM (1 M = 10^9 nM)
    nM_ml = molar_concentration * 1e9
    
    return nM_ml


In [36]:
df = pd.read_excel(r"D:\OneDrive\Documentos\LabMol\QSARLit Colab\automated-qsar-framework-master\malaria\data_sets\bioactivity-18_17_57_08.xls.xlsx")
df.head()

Unnamed: 0,CMPD_CHEMBLID,MOLREGNO,PARENT_CMPD_CHEMBLID,PARENT_MOLREGNO,MOL_PREF_NAME,COMPOUND_KEY,MOLWEIGHT,ALOGP,PSA,NUM_RO5_VIOLATIONS,...,DOC_CHEMBLID,PUBMED_ID,JOURNAL,YEAR,VOLUME,ISSUE,FIRST_PAGE,CELL_ID,CELL_CHEMBL_ID,CELL_NAME
0,CHEMBL3219898,1726292,CHEMBL3219898,1726292,,17,570.06,7.68,94.13,2.0,...,CHEMBL3217722,,MedChemComm,2012,3,1.0,71,,,
1,CHEMBL3219932,1726327,CHEMBL3219932,1726327,,52,576.09,5.01,112.59,3.0,...,CHEMBL3217722,,MedChemComm,2012,3,1.0,71,,,
2,CHEMBL3219909,1726304,CHEMBL3219909,1726304,,29,616.54,7.37,112.15,2.0,...,CHEMBL3217722,,MedChemComm,2012,3,1.0,71,,,
3,CHEMBL3219918,1726313,CHEMBL3219918,1726313,,38,598.1,7.1,112.59,3.0,...,CHEMBL3217722,,MedChemComm,2012,3,1.0,71,,,
4,CHEMBL3219931,1726326,CHEMBL3219931,1726326,,51,562.07,4.94,112.59,2.0,...,CHEMBL3217722,,MedChemComm,2012,3,1.0,71,,,


In [37]:
df = df.loc[:, ['CANONICAL_SMILES', 'STANDARD_VALUE', 'STANDARD_UNITS', 'MOLWEIGHT']]
df = df.dropna(subset=['STANDARD_VALUE'])

if 'ug.mL-1' in df['STANDARD_UNITS']:
    df['STANDARD_VALUE'] = df.apply(lambda x: convert_ug_ml_to_nM(x['STANDARD_VALUE'], x['MOLWEIGHT']), axis=1)
    df['STANDARD_UNITS'] = 'nM'
else:
    df['STANDARD_UNITS'] = 'nM'
df

Unnamed: 0,CANONICAL_SMILES,STANDARD_VALUE,STANDARD_UNITS,MOLWEIGHT
0,CCN1CCN(CC1)c2nc(Nc3ccc(Nc4ccnc5cc(Cl)ccc45)cc...,2.470000e-03,nM,570.06
1,Clc1ccc2c(Nc3ccc(Nc4nc(NCCCN5CCOCC5)nc(n4)N6CC...,2.760000e-03,nM,576.09
2,Clc1cccc(Nc2nc(NCCCN3CCOCC3)nc(Nc4ccc(Nc5ccnc6...,3.820000e-03,nM,616.54
3,COc1ccc(Nc2nc(Nc3ccc(Nc4ccnc5cc(Cl)ccc45)cc3)n...,5.040000e-03,nM,598.10
4,Clc1ccc2c(Nc3ccc(Nc4nc(NCCN5CCOCC5)nc(n4)N6CCO...,5.870000e-03,nM,562.07
...,...,...,...,...
1678,NC(=N)NCCCNCCCNCCCNC(=N)N,1.470000e+05,nM,272.39
1679,CC(C)[C@H](N1C(=O)c2ccccc2C1=O)C(=O)N3CCOCC3,2.056000e+05,nM,316.35
1680,CC(=CCC\C(=C\CNCCOc1cccc2ccccc12)\C)C,3.090000e+05,nM,323.47
1681,CC[C@H](C)[C@H](N)C(=O)N[C@@H](CC(C)C)C(=O)NCC...,5.190000e+05,nM,784.98


In [38]:
for a in df['STANDARD_VALUE']:
    a/1000
    df['STANDARD_UNITS'] = 'uM'

df['pI50 (uM)'] = -np.log10(df['STANDARD_VALUE'] / 1e6)
df['binary'] = df['pI50 (uM)'].apply(lambda x: 1 if x >= 6 else 0)
df

Unnamed: 0,CANONICAL_SMILES,STANDARD_VALUE,STANDARD_UNITS,MOLWEIGHT,pI50 (uM),binary
0,CCN1CCN(CC1)c2nc(Nc3ccc(Nc4ccnc5cc(Cl)ccc45)cc...,2.470000e-03,uM,570.06,8.607303,1
1,Clc1ccc2c(Nc3ccc(Nc4nc(NCCCN5CCOCC5)nc(n4)N6CC...,2.760000e-03,uM,576.09,8.559091,1
2,Clc1cccc(Nc2nc(NCCCN3CCOCC3)nc(Nc4ccc(Nc5ccnc6...,3.820000e-03,uM,616.54,8.417937,1
3,COc1ccc(Nc2nc(Nc3ccc(Nc4ccnc5cc(Cl)ccc45)cc3)n...,5.040000e-03,uM,598.10,8.297569,1
4,Clc1ccc2c(Nc3ccc(Nc4nc(NCCN5CCOCC5)nc(n4)N6CCO...,5.870000e-03,uM,562.07,8.231362,1
...,...,...,...,...,...,...
1678,NC(=N)NCCCNCCCNCCCNC(=N)N,1.470000e+05,uM,272.39,0.832683,0
1679,CC(C)[C@H](N1C(=O)c2ccccc2C1=O)C(=O)N3CCOCC3,2.056000e+05,uM,316.35,0.686977,0
1680,CC(=CCC\C(=C\CNCCOc1cccc2ccccc12)\C)C,3.090000e+05,uM,323.47,0.510042,0
1681,CC[C@H](C)[C@H](N)C(=O)N[C@@H](CC(C)C)C(=O)NCC...,5.190000e+05,uM,784.98,0.284833,0


In [40]:
df.to_csv(r"D:\OneDrive\Documentos\LabMol\QSARLit Colab\automated-qsar-framework-master\malaria\data_sets\bioactivity_data_prepared.csv", index=False)