In [None]:
# General python libraries and packages
import pandas as pd
from tqdm.auto import tqdm
import ast
import warnings
warnings.filterwarnings('ignore')
import datetime
date = datetime.date.today()
import os

# Database Accession
from chembl_webresource_client.new_client import new_client

# Data Visualization
import matplotlib.pyplot as plt

# Processing chemical data
from rdkit.Chem import PandasTools, Draw

# Identify targets of interest 

In [None]:
def get_single_protein_targets_from_chembl(xref_filter=None, eukaryotic_only=False):

    """
    Helper function to retrieve all targets of type "SINGLE PROTEIN" from ChEMBL.
    Optionally, filter for a cross-reference annotation, e.g. the Protein Family (Pfam) ID, 
    and/or filter for eukaryotic single proteins only.

    Parameters:
    - xref_filter (str, optional):      Cross-reference ID (e.g. the Pfam ID 'PF00069'). If given, only
                                        targets with a component carrying this cross-reference are kept.
    - eukaryotic_only (bool, optional): If True, the organism class of each target is looked up via its
                                        taxonomy ID and only 'Eukaryotes' are kept.

    Returns:
    - pandas DataFrame with one row per ChEMBL target ID. Target_EC_Number holds the distinct
      EC numbers of the target joined by ', ' (empty string if there is none).
      Target_Organism_Class is only present if eukaryotic_only is True.
    """

    def _value_of(key):
        # Extractor that reads `key` from dict entries and maps everything else (NaN, None) to None
        return lambda entry: entry.get(key) if isinstance(entry, dict) else None

    def _ec_number(synonym):
        # Only synonyms of type EC_NUMBER carry an EC number; everything else becomes ''
        if isinstance(synonym, dict) and synonym.get('syn_type') == 'EC_NUMBER':
            value = synonym.get('component_synonym')
            return '' if value is None else str(value)
        return ''

    def _join_ec_numbers(values):
        # Skip the empty placeholders, otherwise the result looks like ', 2.7.11.1'
        return ', '.join(dict.fromkeys(value for value in values if value))

    # New client to query the ChEMBL API for target information
    targets_api = new_client.target

    # Retrieve specified information for every single protein type target in the ChEMBL
    df = pd.DataFrame(targets_api.filter(target_type="SINGLE PROTEIN").only(
        "organism", "pref_name", "target_components", "target_chembl_id", "tax_id"))

    # Rename columns 
    df = df.rename(columns= {"organism"          : "Target_Organism", 
                             "pref_name"         : "Target_Name", 
                             "target_components" : "Target_Components", 
                             "target_chembl_id"  : "ChEMBL_Target_ID"})

    print(f"Number of single proteins in the ChEMBL = {len(df)}")

    # Explode the Target_Components column
    # (the index is reset after every explode, so the next explode never sees duplicate labels)
    df = df.explode(column='Target_Components').reset_index(drop=True)

    # Extract Target_Components_XRefs
    df['Target_Components_XRefs'] = df['Target_Components'].apply(_value_of('target_component_xrefs'))
    df = df.explode(column='Target_Components_XRefs').reset_index(drop=True)
    df['Target_Components_XRefs'] = df['Target_Components_XRefs'].apply(_value_of('xref_id'))

    # Optionally filter by an ID from a cross-reference (e.g. by Pfam-ID)
    if xref_filter:
        df = df[df['Target_Components_XRefs'] == xref_filter]
        df = df.reset_index(drop=True)
        print(f"Number of single proteins after filtering for {xref_filter} = {len(df)}")

    # Extract the Target Component ID
    df['ChEMBL_Target_Component_ID'] = df['Target_Components'].apply(_value_of('component_id'))

    # Extract the Enzyme Commission (EC) Number
    df['Target_Component_Synonyms'] = df['Target_Components'].apply(_value_of('target_component_synonyms'))
    df = df.explode(column='Target_Component_Synonyms').reset_index(drop=True)
    df['Target_EC_Number'] = df['Target_Component_Synonyms'].apply(_ec_number)

    # Drop columns we don't need anymore
    df = df.drop(columns=['Target_Components',  'Target_Component_Synonyms', 'Target_Components_XRefs'])
    df = df.drop_duplicates().reset_index(drop=True)

    # Group by the ChEMBL_Target_ID again (first value per column, EC numbers joined)
    column_order = list(df.columns)
    aggregations = {col: 'first' for col in column_order
                    if col not in ('ChEMBL_Target_ID', 'Target_EC_Number')}
    aggregations['Target_EC_Number'] = _join_ec_numbers
    df = df.groupby('ChEMBL_Target_ID', as_index=False).agg(aggregations)
    df = df[column_order].reset_index(drop=True)

    if eukaryotic_only: 
        # Retrieve the organism class name for each organism (taxonomy ID);
        # every taxonomy ID is requested only once and missing IDs are skipped
        organism_api = new_client.organism
        tax_ids = [int(tax_id) for tax_id in df['tax_id'].dropna().unique()]
        organism_type = organism_api.filter(
            tax_id__in=tax_ids
            ).only("l1", "tax_id") # "class_level"  #'parent_id'

        organism_df = pd.DataFrame.from_records(list(tqdm(organism_type)))
        organism_df = organism_df.rename(columns={'l1' : 'Target_Organism_Class'})

        # Merge dataframes
        df = pd.merge(df, organism_df, on='tax_id', how='left')

        # The actual filtering
        df = df[df['Target_Organism_Class'] == 'Eukaryotes']
        df = df.reset_index(drop=True)

        print(f"Number of single target proteins after filtering for 'Eukaryotes' only = {len(df)}")

    return df

In [None]:
def get_chembl_targets(xref_filter=None, protein_class_filter=None, eukaryotic_only=False):    

    """ 
    Retrieve targets from ChEMBL via API.
    Optionally filter for a cross-reference annotation, e.g. the Protein Family (Pfam) ID, 
    and/or filter for eukaryotic single proteins only, 
    and/or filter for the protein class (e.g. protein kinases).

    Parameters:
    - xref_filter (str, optional):          Cross-reference ID, passed on to
                                            get_single_protein_targets_from_chembl.
    - protein_class_filter (str, optional): Pattern that has to occur in the protein class description
                                            (levels separated by '; ', e.g. 'enzyme; kinase; protein kinase').
    - eukaryotic_only (bool, optional):     Passed on to get_single_protein_targets_from_chembl.

    Returns:
    - pandas DataFrame with one row per ChEMBL target ID. The protein class columns
      (Target_Protein_Class_Name, Target_Protein_Class_1..6) are only present if protein_class_filter
      is given, Target_Organism_Class only if eukaryotic_only is True.
    """

    df = get_single_protein_targets_from_chembl(xref_filter=xref_filter, eukaryotic_only=eukaryotic_only)

    # New client to query the ChEMBL API for target component information
    target_component_api = new_client.target_component

    # Retrieve the protein classification id for the target_components from the ChEMBL
    # (every component is requested only once, missing IDs are skipped)
    component_ids = df['ChEMBL_Target_Component_ID'].dropna().unique().tolist()
    target_component = target_component_api.filter(
        target_component_id__in=component_ids
        ).only("component_id", "protein_classifications", "accession")

    target_component_df = pd.DataFrame.from_records(list(tqdm(target_component)))

    # Explode the protein_classifications column
    target_component_df = target_component_df.explode(column='protein_classifications').reset_index(drop=True)

    # Extract protein classification ID
    target_component_df['protein_class_id'] = target_component_df['protein_classifications'].apply(lambda x: str(x.get('protein_classification_id')) if isinstance(x, dict) else None)

    # Rename the component_id and accession columns
    # (done here, so that UniProt_Target_ID also exists without a protein class filter)
    target_component_df = target_component_df.rename(columns={'component_id': 'ChEMBL_Target_Component_ID',
                                                              'accession'   : 'UniProt_Target_ID'})

    df = pd.merge(df, target_component_df, on='ChEMBL_Target_Component_ID', how='left')


    if protein_class_filter: 
        # Targets without a protein classification cannot match the filter; 
        # drop them, as missing values cannot be converted to int
        df['protein_class_id'] = pd.to_numeric(df['protein_class_id'], errors='coerce')
        df = df.dropna(subset=['protein_class_id']).reset_index(drop=True)
        df['protein_class_id'] = df['protein_class_id'].astype(int)

        # Retrieve the protein class name for each protein classification id
        proteinclass_api = new_client.protein_classification
        proteinclass = proteinclass_api.filter(
            protein_classification_id__in=df['protein_class_id'].unique().tolist()
            ).only("protein_class_id", "pref_name", "protein_class_desc") # "class_level"  #'parent_id'
        
        proteinclass_df = pd.DataFrame.from_records(list(tqdm(proteinclass)))

        # Merge dataframes
        df = pd.merge(df, proteinclass_df, on='protein_class_id', how='left')
            
        # Rename columns
        df = df.rename(columns={'pref_name'          : 'Target_Protein_Class_Name',
                                'protein_class_desc' : 'Target_Protein_Class_Description'})

        # Replace any double spaces by a ;
        df['Target_Protein_Class_Description'] = df['Target_Protein_Class_Description'].str.replace('  ', '; ', regex=False)

        # Optionally filter, by the protein class description 
        # (na=False: rows without a description do not match instead of breaking the boolean mask)
        matches = df['Target_Protein_Class_Description'].str.contains(protein_class_filter, na=False)
        df = df[matches].reset_index(drop=True)
        print(f"Number of single proteins after filtering for '{protein_class_filter}' = {len(df)}")

        # Split the protein class description into its 6 levels
        level_columns = [f'Target_Protein_Class_{level}' for level in range(1, 7)]
        df['Target_Protein_Class_Description'] = df['Target_Protein_Class_Description'].str.split('; ')
        df['Target_Protein_Class_Description'] = df['Target_Protein_Class_Description'].apply(lambda x: x + [None]*(6 - len(x)) if len(x) < 6 else x)
        # Explicit column names keep this working for an empty result as well
        df[level_columns] = pd.DataFrame(
            [levels[:6] for levels in df['Target_Protein_Class_Description']],
            index=df.index, columns=level_columns)
    
    # Collapse to one row per target. Only columns that actually exist are aggregated, 
    # because some of them are created by the optional filters only.
    join_values = lambda x: ', '.join(x.astype(str))
    aggregations = {
        'Target_Organism'            : 'first', 
        'Target_Organism_Class'      : 'first', 
        'Target_Name'                : 'first', 
        'ChEMBL_Target_ID'           : 'first', 
        'ChEMBL_Target_Component_ID' : 'first',
        'UniProt_Target_ID'          : 'first',
        'Target_EC_Number'           : 'first',
        'Target_Protein_Class_Name'  : join_values,
        **{f'Target_Protein_Class_{level}': join_values for level in range(1, 7)},
    }
    output_columns = [col for col in aggregations if col in df.columns]
    value_aggregations = {col: aggregations[col] for col in output_columns if col != 'ChEMBL_Target_ID'}
    df = df.groupby('ChEMBL_Target_ID', as_index=False).agg(value_aggregations)

    return df[output_columns].reset_index(drop=True)

## Filters
- **Target Type** = Single Protein; 
- Comprising the protein kinase domain (Pfam-ID = PF00069) (as annotated in the **xref**)
- **Protein Classification** = Enzyme and Kinase;
- **Target Organism Classification** = Eukaryotic;

In [None]:
# Kinase targets: Pfam protein kinase domain, protein kinase class, eukaryotic organisms only
df = get_chembl_targets(
    xref_filter='PF00069',
    protein_class_filter='enzyme; kinase; protein kinase',
    eukaryotic_only=True,
)

# These columns only served as filter criteria and are identical for all remaining targets
filter_only_columns = [
    'Target_Organism_Class',
    'Target_Protein_Class_1',
    'Target_Protein_Class_2',
    'Target_Protein_Class_3',
]
df = df.drop(columns=filter_only_columns)

df.head(1)

In [None]:
# Protein classification level 4-6 labels correspond to kinase group/family/subfamily
# Rename columns accordingly
df = df.rename(columns={'Target_Protein_Class_4':'Kinase_Group', 
                        'Target_Protein_Class_5':'Kinase_Family', 
                        'Target_Protein_Class_6':'Kinase_Subfamily'})

# Output folders: final results go to `filepath`, intermediate results to `tmp_folder`
filepath = os.path.join('Data', 'ChEMBL_Kinases')
tmp_folder = os.path.join(filepath, 'Temp')

# to_excel does not create missing folders, so make sure they exist (also creates `filepath`)
os.makedirs(tmp_folder, exist_ok=True)
df.to_excel(os.path.join(tmp_folder, 'ChEMBL_Kinases_Target_Data.xlsx'), index=False)

df.head(2)

# Retrieve Bioactivities 

In [None]:
def get_chembl_bioactivities(df, target_id_col):
    """
    Get the bioactivities reported for each target in the input DataFrame and append the infos to the input DataFrame.

    Parameters:
    - df (pandas DataFrame):  Input DataFrame containing the targets
    - target_id_col (str):    Column name in the input DataFrame containing the ChEMBL target IDs

    Returns:
    - pandas DataFrame: outer merge of the input DataFrame with the retrieved bioactivities 
      (one row per target and bioactivity; targets without bioactivities are kept).
    """

    # Create a new bioactivities API client
    bioactivities_api = new_client.activity

    # Fields requested per bioactivity record
    activity_fields = (
        'target_chembl_id',
        'molecule_chembl_id', 'canonical_smiles',
        'assay_chembl_id', 'assay_type', 'activity_id', # "assay_description", #"activity_comment",
        'type', 'relation', 'standard_value', 'standard_units', 'pchembl_value', 'ligand_efficiency'
        #"comment"
    )

    # Initialize an empty list to store the bioactivity data for each target
    bioactivity_list = []

    # Get the bioactivities, querying every target only once
    target_ids = df[target_id_col].dropna().unique()
    for target_id in tqdm(target_ids):
        bioactivities = bioactivities_api.filter(target_chembl_id=target_id).only(*activity_fields)

        # Create a dataframe from the query results; targets without any bioactivity are skipped
        bioactivities_df = pd.DataFrame.from_records(list(bioactivities))
        if not bioactivities_df.empty:
            bioactivity_list.append(bioactivities_df)

    if bioactivity_list:
        bioactivity_df = pd.concat(bioactivity_list, ignore_index=True)
    else:
        # pd.concat raises on an empty list; fall back to an empty frame with the expected columns
        bioactivity_df = pd.DataFrame(columns=list(activity_fields))

    # Extract LE value (records without ligand efficiency data carry no dict)
    bioactivity_df['ligand_efficiency'] = bioactivity_df['ligand_efficiency'].apply(
        lambda x: x.get('le') if isinstance(x, dict) else None)

    # Drop duplicates and reset index
    bioactivity_df = bioactivity_df.drop_duplicates(keep="first").reset_index(drop=True)

    # Rename columns
    bioactivity_df = bioactivity_df.rename(columns={
        'target_chembl_id'  : target_id_col,
        'molecule_chembl_id': 'ChEMBL_Molecule_ID', 
        'canonical_smiles'  : 'SMILES',
        'assay_chembl_id'   : 'ChEMBL_Assay_ID',
        'assay_type'        : 'ChEMBL_Assay_Type',
        'activity_id'       : 'ChEMBL_Activity_ID', 
        'type'              : 'ChEMBL_Activity_Type',
        'relation'          : 'ChEMBL_Relation', 
        'standard_value'    : 'ChEMBL_Standard_Value', 
        'standard_units'    : 'ChEMBL_Standard_Unit', 
        'pchembl_value'     : 'pChEMBL_Value',
        'ligand_efficiency' : 'Ligand_Efficiency'
    })

    # Merge with the input df,  based on the target_id_col
    return pd.merge(df, bioactivity_df, on=target_id_col, how='outer')

In [None]:
# The run time per target ID strongly depends on the number of associated compounds and 
# bioactivity values, so it does not scale linearly. Rough estimate = 5 h !
bioactivity_file = os.path.join(tmp_folder, 'ChEMBL_Kinases_Bioactivity_Data.xlsx')
df = get_chembl_bioactivities(df, target_id_col='ChEMBL_Target_ID')
df.to_excel(bioactivity_file, index=False)
# df.head()

## Filter for molecules with a ligand efficiency (LE) >= 0.3 <br> and/or a pChEMBL value of >= 5.0 towards any kinase

The **pChEMBL value** is the negative decadic logarithm of the molar activity. It is available for activities reported with
- "standard_type" = "IC50", "XC50", "EC50", "AC50", "Ki", "Kd", "Potency", "ED50"; 
- "standard_relation" == "=" 
- "standard_units" == "nM";  <br> 

A pChEMBL value of 5 thus equals an activity of 10 µM and a pChEMBL value of 9 equals an activity of 1 nM. <br>
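A commonly used cut-off for bioactive molecules is a pChEMBL value of at least 5 (an activity of 10 µM or better); this notebook keeps activities with a pChEMBL value >= 5.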

The **Ligand Efficiency (LE)** relates the activity to the number of non-hydrogen atoms (NHA): <br>
$\mathrm{LE} = \dfrac{-1.37 \cdot \log_{10}(\mathrm{Activity})}{\mathrm{NHA}}$ (activity in mol/L) <br>

Bento et al. (2014) https://doi.org/10.1093/nar/gkt1031 <br>
 "The ligand efficiencies are calculated on the standardized pChEMBL values" <br>

Schultes et al (2010)  https://doi.org/10.1016/j.ddtec.2010.11.003: <br>
Application of ligand efficiency scores to FBDD: <br>
- In general, an orally available clinical candidate possesses a potency of better than 10 nM and, if ‘Rule-of-Five’-compliant, a maximal molecular weight of 500 Da (which equals, on average, 38 HA). This means that a LE of at least 0.29 kcal mol−1 HA−1 needs to be maintained during hit optimization. 
-  For a ‘Rule-of-Three’-compliant fragment hit with less than 300 Da (that equals on average 23HA) this would result in  a LE of at least 0.36 kcal mol−1 HA−1.

In [None]:
# The API delivers both measures as text, so cast them to numbers (unparseable values become NaN)
for numeric_col in ('Ligand_Efficiency', 'pChEMBL_Value'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Keep bioactivities with a ligand efficiency of >= 0.3 and/or a pChEMBL value of >= 5.0 towards any kinase
is_efficient = df['Ligand_Efficiency'] >= 0.3
is_potent = df['pChEMBL_Value'] >= 5
ChEMBL_df = df[is_efficient | is_potent].reset_index(drop=True)

filtered_file = os.path.join(tmp_folder, 'ChEMBL_Kinases_Bioactivity_Data_Filtered.xlsx')
ChEMBL_df.to_excel(filtered_file, index=False)

print(len(ChEMBL_df))

## Retrieve more information on the assay
**ChEMBL_Assay_Confidence_Score** <br> https://chembl.gitbook.io/chembl-interface-documentation/frequently-asked-questions/chembl-data-questions#what-is-the-confidence-score
| CONFIDENCE_SCORE | DESCRIPTION                                                                 |
|------------------|-----------------------------------------------------------------------------|
| 0                | Default value - Target assignment has yet to be curated                     |
| 1                | Target assigned is non-molecular                                            |
| 3                | Target assigned is molecular non-protein target                             |
| 4                | Multiple homologous protein targets may be assigned (e.g. a PROTEIN FAMILY) |
| 5                | Multiple direct protein targets may be assigned (e.g. a PROTEIN FAMILY)     |
| 6                | Homologous protein complex subunits assigned                                |
| 7                | Direct protein complex subunits assigned                                    |
| 8                | Homologous single protein target assigned                                   |
| 9                | Direct single protein target assigned                                       |

In [None]:
def get_chembl_assay_info(df, assay_id_col):
    """
    Retrieves more information on the assay from the ChEMBL via API.

    Parameters:
    - df (pandas DataFrame):  Input DataFrame containing the ChEMBL assay IDs.
    - assay_id_col (str):     Column name in the input DataFrame containing the ChEMBL assay IDs.

    Returns:
    - pandas DataFrame: outer merge of the input DataFrame with the columns 
      ChEMBL_Assay_Confidence_Score, ChEMBL_Assay_Confidence_Description and ChEMBL_Assay_Description.
    """

    # Every assay is usually referenced by many bioactivity rows; request each assay ID only once
    assay_ids = df[assay_id_col].dropna().unique().tolist()
    if not assay_ids:
        # Nothing to look up: return the input with empty assay info columns
        return df.assign(ChEMBL_Assay_Confidence_Score=None,
                         ChEMBL_Assay_Confidence_Description=None,
                         ChEMBL_Assay_Description=None)

    # Create a new assay API client
    assay_api = new_client.assay

    # Get more information for the ChEMBL assay IDs 
    assay_info = assay_api.get(assay_chembl_id=assay_ids).only(
        "assay_chembl_id", "confidence_score", "confidence_description", "description") 

    # Create a dataframe from the retrieved info
    assay_df = pd.DataFrame.from_records(list(tqdm(assay_info)))

    # Drop duplicates and reset index
    assay_df = assay_df.drop_duplicates(keep="first").reset_index(drop=True)

    # Rename columns
    assay_df = assay_df.rename(columns={
        'assay_chembl_id'        :  assay_id_col,
        'confidence_score'       : 'ChEMBL_Assay_Confidence_Score',
        'confidence_description' : 'ChEMBL_Assay_Confidence_Description',
        'description'            : 'ChEMBL_Assay_Description'
        })

    # Merge with the input df, based on the assay_id_col and return the merged DataFrame
    return pd.merge(df, assay_df, on=assay_id_col, how='outer')

In [None]:
ChEMBL_df = get_chembl_assay_info(ChEMBL_df, assay_id_col='ChEMBL_Assay_ID')

# Overview of how confidently the assays are assigned to their targets
for confidence_col in ('ChEMBL_Assay_Confidence_Score', 'ChEMBL_Assay_Confidence_Description'):
    print(ChEMBL_df[confidence_col].value_counts())

## Filter for ChEMBL_Assay_Confidence_Score == 9

In [None]:
# Keep only assays with ChEMBL_Assay_Confidence_Score = 9 (Direct single protein target assigned)
has_direct_single_target = ChEMBL_df['ChEMBL_Assay_Confidence_Score'] == 9
ChEMBL_df = ChEMBL_df.loc[has_direct_single_target]

# The confidence columns are constant after the filter and no longer needed
confidence_columns = ['ChEMBL_Assay_Confidence_Score', 'ChEMBL_Assay_Confidence_Description']
ChEMBL_df = ChEMBL_df.drop(columns=confidence_columns)

# Save to file
filtered_file = os.path.join(tmp_folder, 'ChEMBL_Kinases_Bioactivity_Data_Filtered.xlsx')
ChEMBL_df.to_excel(filtered_file, index=False)

# Retrieve more information on the compounds

In [None]:
def get_chembl_cmpd_infos(df, molecule_id_column):
    """
    Get more information on each compound in the input DataFrame and append to the latter.

    Parameters:
    - df (pandas DataFrame):    Input DataFrame containing the compounds ChEMBL IDs.
    - molecule_id_column (str): Column name in the input DataFrame containing the ChEMBL Molecule IDs

    Returns:
    - pandas DataFrame: outer merge of the input DataFrame with the columns Molecule_Name and Molecule_Type.
    """
    # A compound usually occurs in many bioactivity rows; request each molecule ID only once
    molecule_ids = df[molecule_id_column].dropna().unique().tolist()
    if not molecule_ids:
        # Nothing to look up: return the input with empty compound info columns
        return df.assign(Molecule_Name=None, Molecule_Type=None)

    # Create a new API client
    molecule_api = new_client.molecule
    
    # Get the cmpd infos
    molecule_info = molecule_api.filter(
        molecule_chembl_id__in=molecule_ids).only(
            "molecule_chembl_id", "pref_name", "molecule_type")

    # Create a dataframe from the retrieved cmpd info
    compounds_df = pd.DataFrame.from_records(list(tqdm(molecule_info)))
    
    # Rename the columns
    compounds_df = compounds_df.rename(columns={
        "molecule_chembl_id"  : molecule_id_column,
        "pref_name"           : "Molecule_Name", 
        "molecule_type"       : "Molecule_Type", 
        })

    # One info row per molecule, so that the merge cannot multiply the rows of the input
    compounds_df = compounds_df.drop_duplicates(subset=molecule_id_column)
    
    # Merge the new dataframe with the input dataframe and return
    return pd.merge(df, compounds_df, on=molecule_id_column, how='outer')

# Looking up all compounds takes approx. 45 min
ChEMBL_df = get_chembl_cmpd_infos(ChEMBL_df, molecule_id_column='ChEMBL_Molecule_ID')
ChEMBL_df.head(1)

## Filter for molecules of type 'Small molecule' or 'Unknown'

In [None]:
# Print, which types of molecules are present
# (a bare expression in the middle of a cell is not displayed, so it has to be printed explicitly)
print(ChEMBL_df['Molecule_Type'].unique())

# Filter for molecules of type 'Small molecule' or 'Unknown' 
ChEMBL_df = ChEMBL_df[ChEMBL_df['Molecule_Type'].isin(['Small molecule', 'Unknown'])]

In [None]:
# Save the filtered bioactivity data (one row per bioactivity) to the results folder
molecule_file = os.path.join(filepath, 'ChEMBL_AllKinases_Molecule_Data.xlsx')
ChEMBL_df.to_excel(molecule_file, index=False)

# Group by Molecule

In [None]:
# Collapse the dataframe to one row per ChEMBL Molecule ID:
# molecule-level columns keep their first value, all other columns are collected into lists
molecule_level_columns = ['ChEMBL_Molecule_ID', 'SMILES', 'Molecule_Type', 'Molecule_Name']
aggregations = {col: 'first' for col in molecule_level_columns}
aggregations.update({
    col: list
    for col in ChEMBL_df.columns.to_list()
    if col not in molecule_level_columns
})
ChEMBL_df = ChEMBL_df.groupby(['ChEMBL_Molecule_ID']).agg(aggregations).reset_index(drop=True)

# Number of unique ChEMBL Target IDs reported per molecule; molecules with most targets come first
ChEMBL_df['Target_Count'] = ChEMBL_df['ChEMBL_Target_ID'].map(lambda targets: len(set(targets)))
ChEMBL_df = ChEMBL_df.sort_values(by='Target_Count', ascending=False).reset_index(drop=True)

print(len(ChEMBL_df))
ChEMBL_df.head(1)

In [None]:
# RDKit cannot parse missing (non-string) SMILES; drop those rows first, 
# so that a single missing value does not make the whole conversion fail
ChEMBL_df = ChEMBL_df[ChEMBL_df['SMILES'].apply(lambda smiles: isinstance(smiles, str))]
ChEMBL_df = ChEMBL_df.reset_index(drop=True)

try: 
    PandasTools.AddMoleculeColumnToFrame(ChEMBL_df, smilesCol="SMILES", molCol='2D_Mol')
except Exception as error: 
    # Keep the notebook running, but report the failure instead of silently swallowing it
    print(f"Could not generate the RDKit molecules: {error}")
    ChEMBL_df['2D_Mol'] = None

# Drop rows, for which no molecule could be generated 
# (a comparison `!= None` is always True in pandas, missing values have to be detected with notna())
ChEMBL_df = ChEMBL_df[ChEMBL_df['2D_Mol'].notna()]
ChEMBL_df = ChEMBL_df.reset_index(drop=True)

In [None]:
# RDKit molecule objects cannot be written to Excel, so export everything except the '2D_Mol' column
grouped_file = os.path.join(filepath, 'ChEMBL_AllKinases_Molecule_Data_Grouped.xlsx')
ChEMBL_df.drop(columns=['2D_Mol']).to_excel(grouped_file, index=False)

print(f"Number of bioactive molecules (before molecular standardization) = {len(ChEMBL_df)} as of {date}")

In [None]:
# Overview of the molecule types, followed by the molecules that have no type assigned
molecule_type_counts = ChEMBL_df['Molecule_Type'].value_counts()
print(molecule_type_counts)
ChEMBL_df.loc[ChEMBL_df['Molecule_Type'] == 'Unknown']

# Distributions and Value Counts

## Number of different kinase targets reported for a cmpd

In [None]:
# Histogram of the number of kinase targets per compound (log-scaled counts)
fig, ax = plt.subplots()
ChEMBL_df['Target_Count'].plot(kind='hist', logy=True, bins=270, ax=ax)

# x-axis = binned Target_Count values, y-axis = number of compounds per bin
ax.set_xlabel('Number of different Kinase Targets')
ax.set_ylabel('Number of Compounds')

# Save before showing the figure, and make sure the output folder exists
plot_folder = os.path.join('Analysis', 'ChEMBL_Kinases')
os.makedirs(plot_folder, exist_ok=True)
fig.savefig(os.path.join(plot_folder, 'Histogram_Number_of_Kinase_Targets_per_Compound.png'))
plt.show()

## pChEMBL value distribution

In [None]:
# # Extract the maximal pChEMBL value reported for a compounds, towards any kinase
# ChEMBL_df['Max_pChEMBL_Value'] = ChEMBL_df['pChEMBL_Value'].apply(lambda x: max(x))

# # Plot pIC50 value distribution 
# plt.hist(ChEMBL_df['Max_pChEMBL_Value'], bins=50, edgecolor='black')
# plt.xlabel('Max pChEMBL value for the compound across all kinase targets.')
# plt.ylabel('Frequency')
# plt.yticks(np.arange(plt.ylim()[0], plt.ylim()[1], 1))
# plt.savefig("Analysis/ChEMBL_Kinases/Histogram_pChEMBL_Value_Distribution.png")

## LE distribution

In [None]:
# # Extract the maximal Ligand Efficiency value reported for a compounds, towards any kinase
# ChEMBL_df['Max_Ligand_Efficiency'] = ChEMBL_df['Ligand_Efficiency'].apply(lambda x: max(x))

# # Plot Ligand Efficiency value distribution 
# plt.hist(ChEMBL_df['Max_Ligand_Efficiency'], bins=50, edgecolor='black')
# plt.xlabel('Max Ligand Efficiency for the compound across all kinase targets.')
# plt.ylabel('Frequency')
# plt.yticks(np.arange(plt.ylim()[0], plt.ylim()[1], 1))
# plt.savefig("Analysis/ChEMBL_Kinases/Histogram_Ligand_Efficiency_Value_Distribution.png")