In [2]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem

def load_datasets(chembl_type):
    """Load the datasets from specified CSV files based on ChEMBL type."""
    current_directory = os.getcwd()
    admetica_folder = os.path.abspath(os.path.join(current_directory, '..'))
    
    # Dynamically construct file paths for each ChEMBL type
    nx_path = os.path.join(current_directory, 'novartis/novartis_data.csv')
    admetica_path = os.path.join(admetica_folder, f'ADMET/metabolism/cyp{chembl_type.lower()}-inhibitor/cyp{chembl_type.lower()}-inhibitor_curated.csv')
    chembl_path = os.path.join(current_directory, 'novartis', 'cyp', chembl_type, f'ChEMBL_30_{chembl_type}.csv')

    # Load datasets
    nx_dataset = pd.read_csv(nx_path)
    admetica_dataset = pd.read_csv(admetica_path)
    chembl_dataset = pd.read_csv(chembl_path)
    
    return nx_dataset, admetica_dataset, chembl_dataset

def canonicalize(smiles_list):
    """Canonicalize a list of SMILES strings."""
    # Convert each SMILES string to its canonical form and return a set of unique canonical SMILES
    return {Chem.CanonSmiles(smiles) for smiles in smiles_list}

def compare_smiles(list1, list2):
    """Compare two lists of SMILES and return common canonicalized representations."""
    # Canonicalize both lists
    canon_list1 = canonicalize(list1)
    canon_list2 = canonicalize(list2)
    
    # Find common canonical SMILES between the two lists
    common_smiles = canon_list1.intersection(canon_list2)
    
    return common_smiles

def filter_chembl_data(chembl_dataset, common_canon_smiles):
    """Filter the ChEMBL dataset based on common SMILES and process the values."""
    # Filter the dataset to include only entries with common canonical SMILES
    filtered_chembl_dataset = chembl_dataset[chembl_dataset['molecule_smiles'].isin(common_canon_smiles)]
    
    # Remove duplicates, giving priority to IC50
    filtered_chembl_dataset['priority'] = filtered_chembl_dataset['Type'].apply(lambda x: 0 if x == 'IC50' else 1)
    filtered_chembl_dataset = filtered_chembl_dataset.sort_values(by=['priority', 'Type'], ascending=[True, True])
    filtered_chembl_dataset = filtered_chembl_dataset.drop_duplicates(subset=['molecule_smiles'], keep='first')

    # Remove specific unwanted types from the dataset
    remove_values = ['Drug metabolism', 'FC', 'Retention_time', 'T1/2', 'mechanism based inhibition', 'Stability']
    filtered_chembl_dataset = filtered_chembl_dataset[~filtered_chembl_dataset['Type'].isin(remove_values)]
    filtered_chembl_dataset = filtered_chembl_dataset.dropna(subset=['Unit'])

    #Remove specific unwanted units from the dataset
    remove_units = ['uL/min', '10\'-4/min']
    filtered_chembl_dataset = filtered_chembl_dataset[~filtered_chembl_dataset['Unit'].isin(remove_units)]
    
    # Filter out 'Activity' entries that do not have the correct unit
    filtered_chembl_dataset = filtered_chembl_dataset[
        ~((filtered_chembl_dataset['Type'] == 'Activity') & (filtered_chembl_dataset['Unit'] != 'uM'))
    ]

    # Process 'Value' based on 'Type' to create a new column
    filtered_chembl_dataset['Processed_Value'] = filtered_chembl_dataset.apply(process_value, axis=1)
    
    # Drop rows with NaN values in 'Processed_Value'
    filtered_chembl_dataset = filtered_chembl_dataset.dropna(subset=['Processed_Value'])

    # Classify entries based on the processed values (binary classification)
    filtered_chembl_dataset['Class'] = filtered_chembl_dataset['Processed_Value'].apply(lambda x: 1 if x < 3 else 0)

    return filtered_chembl_dataset

def process_value(row):
    """Apply different operations to 'Value' based on 'Type'."""
    type_val = row['Type']
    value = row['Value']
    
    # Attempt to convert value to float; if unsuccessful, return NaN
    try:
        value = float(value)  # Convert value to float
    except (ValueError, TypeError):
        return np.nan  # Return NaN if conversion fails
    
    # Perform operations based on the 'Type' of measurement
    if type_val in ['IC50', 'AC50', 'KI', 'Potency']:
        return value / 1000  # Convert to uM for these types
    elif type_val == 'Inhibition':
        return 1 if value > 50 else 0  # Binary classification based on inhibition percentage
    elif type_val == 'Activity':
        return value  # Keep the original value for Activity
    else:
        return value  # Return the value as is for other types

def process_all_chembl():
    """Iterate over different ChEMBL datasets and apply the pipeline."""
    chembl_types = ['2C9', '3A4', '2D6']  # List of ChEMBL types
    
    for chembl_type in chembl_types:
        print(f"\nProcessing ChEMBL {chembl_type}...")
        
        # Load datasets for the current ChEMBL type
        nx_dataset, admetica_dataset, chembl_dataset = load_datasets(chembl_type)

        # Extract SMILES lists from each dataset
        nx_smiles = nx_dataset['smiles'].tolist()
        chembl_smiles = chembl_dataset['molecule_smiles'].tolist()
        admetica_smiles = admetica_dataset['smiles'].tolist()

        # Compare and find common SMILES between NX and ChEMBL datasets
        common_canon_smiles = compare_smiles(nx_smiles, chembl_smiles)
        print(f"Number of common SMILES between NX and ChEMBL: {len(common_canon_smiles)}")

        # Filter the ChEMBL dataset based on the common SMILES found
        filtered_chembl_dataset = filter_chembl_data(chembl_dataset, common_canon_smiles)
        print(f"Filtered ChEMBL dataset shape: {filtered_chembl_dataset.shape}")

        # Further filtering against the Admetica dataset to exclude overlapping SMILES
        common_canon_smiles_admetica = compare_smiles(admetica_smiles, filtered_chembl_dataset['molecule_smiles'].tolist())
        filtered_chembl_dataset = filtered_chembl_dataset[~filtered_chembl_dataset['molecule_smiles'].isin(common_canon_smiles_admetica)]
        print(f"Filtered ChEMBL dataset shape after removing common SMILES with Admetica: {filtered_chembl_dataset.shape}")

        # Class distribution in the filtered dataset
        class_counts = filtered_chembl_dataset['Class'].value_counts()
        print(f"Class distribution:\n{class_counts}")

        # Final processing for NX dataset based on remaining common SMILES
        filtered_chembl_smiles = filtered_chembl_dataset['molecule_smiles'].tolist()
        common_canon_smiles_final = compare_smiles(filtered_chembl_smiles, nx_smiles)
        nx_filtered = nx_dataset[nx_dataset['smiles'].isin(common_canon_smiles_final)]
    
        # Sort the NX dataset based on the order of filtered ChEMBL SMILES
        order = {smile: idx for idx, smile in enumerate(filtered_chembl_smiles) if smile in common_canon_smiles_final}
        nx_sorted = nx_filtered.sort_values(by='smiles', key=lambda x: x.map(order)).reset_index(drop=True)

        # Assuming chembl_dataset is a path to the original ChEMBL file
        chembl_dir = os.path.join(os.getcwd(), 'novartis', 'cyp', chembl_type)

        # Save the final datasets in the same directory as chembl_dataset
        filtered_chembl_dataset.to_csv(os.path.join(chembl_dir, f'ChEMBL_30_{chembl_type}_processed.csv'), index=False)
        nx_sorted.to_csv(os.path.join(chembl_dir, f'novartis_{chembl_type}_processed.csv'), index=False)

if __name__ == "__main__":
    process_all_chembl()


Processing ChEMBL 2C9...
Number of common SMILES between NX and ChEMBL: 918
Filtered ChEMBL dataset shape: (514, 25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_chembl_dataset['priority'] = filtered_chembl_dataset['Type'].apply(lambda x: 0 if x == 'IC50' else 1)


Filtered ChEMBL dataset shape after removing common SMILES with Admetica: (464, 25)
Class distribution:
Class
0    329
1    135
Name: count, dtype: int64

Processing ChEMBL 3A4...
Number of common SMILES between NX and ChEMBL: 1279
Filtered ChEMBL dataset shape: (841, 25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_chembl_dataset['priority'] = filtered_chembl_dataset['Type'].apply(lambda x: 0 if x == 'IC50' else 1)


Filtered ChEMBL dataset shape after removing common SMILES with Admetica: (788, 25)
Class distribution:
Class
0    549
1    239
Name: count, dtype: int64

Processing ChEMBL 2D6...


  chembl_dataset = pd.read_csv(chembl_path)


Number of common SMILES between NX and ChEMBL: 1092
Filtered ChEMBL dataset shape: (715, 25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_chembl_dataset['priority'] = filtered_chembl_dataset['Type'].apply(lambda x: 0 if x == 'IC50' else 1)


Filtered ChEMBL dataset shape after removing common SMILES with Admetica: (639, 25)
Class distribution:
Class
0    444
1    195
Name: count, dtype: int64
