Installing Requirements

In [2]:
pip install molvs
pip install pandasql
from molvs import standardize_smiles
import pandasql

SyntaxError: invalid syntax (1550266394.py, line 1)

Screening & Filtering

In [3]:
def filter_data(files, mw_min=100, mw_max=750):
    """Screen ChEMBL activity exports and convert the kept IC50 values to pIC50.

    Each file is read as a ';'-separated CSV. A record is kept only when
    all of the following hold:

    * 'Smiles', 'Standard Value' and 'Standard Relation' are present,
    * the standard relation is exactly '=' (censored values such as '>',
      '<', '>=' and '<=' are discarded),
    * ``mw_min < Molecular Weight < mw_max`` (both bounds exclusive),
    * the standard value is strictly positive, so the logarithm is finite.

    Parameters
    ----------
    files : iterable
        Paths (or open file-like objects) accepted by ``pandas.read_csv``.
    mw_min, mw_max : float, optional
        Exclusive molecular-weight window. The defaults (100 and 750) are
        the bounds the function has always applied.

    Returns
    -------
    pandas.DataFrame
        Columns 'Assay ChEMBL ID', 'Molecule ChEMBL ID', 'Smiles' and
        'Standard Value', where 'Standard Value' now holds pIC50.
        Rows keep the index labels they had in their source file.
        An empty frame with these columns is returned when ``files`` is empty.

    Raises
    ------
    KeyError
        If a file lacks one of the required columns.
    """
    import pandas as pd
    import numpy as np

    source_cols = ['Assay ChEMBL ID', 'Molecule ChEMBL ID', 'Smiles',
                   'Standard Relation', 'Standard Value', 'Molecular Weight']
    result_cols = ['Assay ChEMBL ID', 'Molecule ChEMBL ID', 'Smiles', 'Standard Value']

    frames = []
    for file in files:
        data = pd.read_csv(file, sep=';')
        # Work on an independent copy so the column assignments below never
        # write into a view of `data`.
        sliced = data[source_cols].copy()
        # Non-numeric entries become NaN and are then dropped or filtered out
        # instead of breaking the comparisons below.
        for col in ('Standard Value', 'Molecular Weight'):
            sliced[col] = pd.to_numeric(sliced[col], errors='coerce')
        sliced = sliced.dropna(subset=['Smiles', 'Standard Value', 'Standard Relation'])

        # Compare the relation for equality after removing surrounding
        # whitespace/quotes. A substring test for '=' would also let '>='
        # and '<=' through.
        relation = sliced['Standard Relation'].astype(str).str.strip().str.strip("'\"")
        keep = (
            (relation == '=')
            & (sliced['Molecular Weight'] > mw_min)
            & (sliced['Molecular Weight'] < mw_max)
            # log10 of zero or a negative number is -inf / NaN
            & (sliced['Standard Value'] > 0)
        )
        sliced = sliced[keep].copy()

        # pIC50 = -log10(IC50 in mol/L).
        # NOTE(review): assumes 'Standard Value' is expressed in nM; the
        # 'Standard Units' column is not checked here — confirm.
        sliced['Standard Value'] = -np.log10(sliced['Standard Value'] * 1e-9)
        frames.append(sliced[result_cols])

    if not frames:
        return pd.DataFrame(columns=result_cols)
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(frames)

In [None]:
# Run the screening/filtering step over every CSV export and keep the
# resulting table in df_main.
# NOTE(review): `file_names` is not defined in the cells shown here; it has to
# be an iterable of CSV paths defined before this cell runs — confirm.
df_main = filter_data(file_names)

Molecular Standardization

In [None]:
def std_data(data: "pd.DataFrame", standardizer=None) -> "pd.DataFrame":
    """Return a copy of ``data`` whose 'Smiles' column has been standardized.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'Smiles' column. It is left unmodified.
    standardizer : callable, optional
        Function mapping one SMILES string to its standardized form.
        Defaults to ``standardize_smiles`` imported from molvs at the top of
        the notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``data`` except for the 'Smiles' column.

    Notes
    -----
    The annotations are written as strings so that defining this function
    does not require the name ``pd`` to exist at module level.
    A line is printed for every SMILES that the standardizer changed.
    """
    if standardizer is None:
        # Looked up at call time so the default keeps following the
        # notebook-level import.
        standardizer = standardize_smiles

    def std_smi(smiles):
        std = standardizer(smiles)
        if std != smiles:
            print(f"Successfully standardized smiles: {std}")
        return std

    # Operate on a copy: the caller's frame stays as it was, and writing into
    # a frame that is itself a slice of another one no longer triggers
    # SettingWithCopy warnings.
    result = data.copy()
    result['Smiles'] = result['Smiles'].apply(std_smi)
    return result

In [None]:
# Standardize the SMILES strings and keep the result in df_std.
# NOTE(review): `data_new` is not defined in the cells shown here, while the
# filtering step stores its output in `df_main` — confirm which frame is meant.
df_std = std_data(data_new)

Grouping & Sorting (2 Layers)

In [None]:
def sorting_data(data):
    """Aggregate activities per (assay, compound) and order them by assay size.

    Two layers are applied:

    1. Within every assay, each SMILES is reduced to one row carrying its
       maximum 'Standard Value', reported as ``pIC50``.
    2. Rows are ordered so that compounds measured in the largest assays
       come first. Assay size is the number of input records of the assay.
       Ties are broken by assay id and then SMILES so the order is
       reproducible from run to run.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Assay ChEMBL ID', 'Smiles' and 'Standard Value'.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Assay_ChEMBL_ID``, ``Smiles`` and ``pIC50``.
    """
    import sqlite3
    import pandas as pd

    # SQL identifiers cannot contain spaces, so use '_' as the word separator.
    df_mod = data.copy()
    df_mod.columns = [col.replace(' ', '_') for col in data.columns]

    query = """
    SELECT
        g.Assay_ChEMBL_ID AS Assay_ChEMBL_ID,
        g.Smiles          AS Smiles,
        g.pIC50           AS pIC50
    FROM (
        -- layer 1: best value of every compound inside every assay
        SELECT Assay_ChEMBL_ID, Smiles, MAX(Standard_Value) AS pIC50
        FROM df_mod
        GROUP BY Assay_ChEMBL_ID, Smiles
    ) AS g
    JOIN (
        -- size of each assay; counting inside the (assay, smiles) groups
        -- would only measure replicates of a single compound
        SELECT Assay_ChEMBL_ID, COUNT(*) AS assay_size
        FROM df_mod
        GROUP BY Assay_ChEMBL_ID
    ) AS s
        -- IS is SQLite's NULL-safe equality, so rows without an assay id survive
        ON g.Assay_ChEMBL_ID IS s.Assay_ChEMBL_ID
    -- layer 2: largest assays first, then a deterministic tie-break
    ORDER BY s.assay_size DESC, g.Assay_ChEMBL_ID, g.Smiles;
    """

    # Run the query on a throw-away in-memory SQLite database.
    conn = sqlite3.connect(':memory:')
    try:
        df_mod.to_sql('df_mod', conn, index=False)
        df_sorted = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    return df_sorted

In [None]:
# Aggregate and order the standardized records; df_sort is the input of the
# duplicate-removal step further down.
df_sort = sorting_data(df_std)

Executing Selective Cleaning

In [None]:
def selective_removal(data):
    """Keep the first row of every distinct 'Smiles' value.

    The shape of the table is printed before and after the removal so the
    number of discarded rows can be read off. The de-duplicated table is
    returned.
    """
    shape_before = data.shape
    print(f'Before dropping duplicates: {shape_before}')
    deduplicated = data.drop_duplicates(subset=['Smiles'])
    shape_after = deduplicated.shape
    print(f'After dropping duplicates: {shape_after}')
    return deduplicated

In [None]:
# Reduce the sorted table to one row per SMILES; since the first occurrence is
# kept, the row order of df_sort decides which value survives.
df_maxIC50 = selective_removal(df_sort)

In [None]:
# Export only the Smiles and pIC50 columns, without the index column, to
# 'data_maxIC50_cleaned.csv' (a relative path).
df_maxIC50[['Smiles', 'pIC50']].to_csv('data_maxIC50_cleaned.csv', index=False)