In [None]:
# General python libraries and packages
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re 
import datetime
date = datetime.date.today()
import os

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Processing chemical data
from rdkit import Chem
from rdkit.Chem import PandasTools, Draw, Descriptors

# Dataset Generation

Subset the all-kinases dataset to the molecules annotated for PKA.

In [None]:
# EC number used below to select the PKA entries from the all-kinases dataset
ec_number = "2.7.11.11"

In [None]:
# By definition, the molecules annotated for PKA are a subset of those annotated for all protein kinases.
# Thus, the PKA dataset can be obtained by simply filtering the all-kinases dataset.
df = pd.read_excel(os.path.join('Data', 'ChEMBL_Kinases', "ChEMBL_AllKinases_Molecule_Data.xlsx"))

# Keep the rows whose 'Target_EC_Number' string contains the EC number as a literal substring.
# regex=False is required here: with the default regex=True, every '.' in the EC number
# would match ANY character instead of a literal dot.
# na=False ensures that NaN values are treated as non-matches, i.e., those rows are not included.
df = df[df['Target_EC_Number'].str.contains(ec_number, regex=False, na=False)]
df = df.reset_index(drop=True)

df.head(1)

Alternatively, if no dataset for the entire protein family is available to subset,
<br> retrieve the target information directly from ChEMBL (via the API), using the `ec_number` for filtering.

In [None]:
# Optional alternative, intentionally left commented out: query the ChEMBL target API for
# single-protein targets whose synonyms contain the EC number, instead of subsetting a local file.
# Uncomment the whole cell to use it.

# from chembl_webresource_client.new_client import new_client

# def get_targets_by_ec_number(ec_number):
#     targets_api = new_client.target
#     targets = targets_api.filter(target_synonym__icontains=ec_number, target_type="SINGLE PROTEIN").only(
#         "organism", "pref_name", "target_chembl_id")
#     df = pd.DataFrame.from_records(targets)
#     df = df.rename(columns = {'organism'        : 'Target_Organism', 
#                             'pref_name'         : 'Target_Name', 
#                             'target_chembl_id'  : 'ChEMBL_Target_ID',
#                             })
#     print(f"Number of single protein targets in the ChEMBL with EC Number {ec_number} = {len(df)}")
#     return df

# df = get_targets_by_ec_number(ec_number='2.7.11.11')
# df

# # Next, conduct all the steps, from the '1_DataRetrival_ChEMBL_All_Kinases.ipynb' Notebook!

Filter for the alpha-isoform of the catalytic subunit only 

In [None]:
# Extract the subunit of PKA, namely catalytic/regulatory, using a case-insensitive pattern search.
# The match is lower-cased so that the equality filter below is case-insensitive as well
# (otherwise a name such as "Catalytic subunit Alpha" would be extracted but then silently filtered out).
df['Target_Protein_Subunit'] = (
    df['Target_Name']
    .str.extract(r'(catalytic|regulatory)', flags=re.IGNORECASE, expand=False)
    .str.lower()
)

# Extract the isoform of the catalytic subunit of PKA, namely alpha/beta/gamma (same case handling as above)
df['Target_Protein_Isoform'] = (
    df['Target_Name']
    .str.extract(r'(alpha|beta|gamma)', flags=re.IGNORECASE, expand=False)
    .str.lower()
)

# Filter for the alpha-isoform of the catalytic subunit only
df = df[(df['Target_Protein_Isoform'] == 'alpha') & (df['Target_Protein_Subunit'] == 'catalytic')]
df = df.reset_index(drop = True)

# Drop helper columns and other invariant columns
df = df.drop(columns=['Target_Protein_Isoform', 'Target_Protein_Subunit', 'Target_EC_Number', #'Target_Name',
                      'Target_Protein_Class_Name', 'Kinase_Group', 'Kinase_Family', 'Kinase_Subfamily', ])

# Save to file (create the output directory first, so that to_excel does not fail on a fresh checkout)
PKA_path = os.path.join('Data', 'ChEMBL_PKA')
os.makedirs(PKA_path, exist_ok=True)
df.to_excel(os.path.join(PKA_path, 'ChEMBL_PKA_Molecule_Data.xlsx'), index=False)

# Print some "statistics"
print(f"Number of different target proteins, after filtering for the alpha-isoform of the catalytic subunit only = {len(df['ChEMBL_Target_ID'].unique())}")
print(f"Target Names = {df['Target_Name'].unique()}")
print(f"from organisms = {df['Target_Organism'].unique()}. \n ")
print(f"Number of bioactive molecules before molecule standardization and deduplication = {len(df['ChEMBL_Molecule_ID'].unique())}")

df.head(1)

# Group by Molecule

In [None]:
# Collapse the rows into one row per molecule, keyed on the ChEMBL Molecule ID.
# The molecule-level descriptors keep their first value; every other column is gathered into a list.
first_value_columns = ['ChEMBL_Molecule_ID', 'SMILES', 'Molecule_Name', 'Molecule_Type']

aggregation = {column: 'first' for column in first_value_columns}
# NOTE(review): 'Target_Name' is positioned directly after the molecule descriptors, but it is
# aggregated into a list like the remaining columns - confirm that a list (not the first value) is intended.
aggregation['Target_Name'] = list
aggregation.update(
    {column: list for column in df.columns.to_list() if column not in first_value_columns}
)

grouped = df.groupby(['ChEMBL_Molecule_ID']).agg(aggregation)
df = grouped.reset_index(drop=True)

print(len(df))

# Save to file
df.to_excel(os.path.join(PKA_path, "ChEMBL_PKA_Molecule_Data_Grouped.xlsx"), index=False)

In [None]:
# Build a molecule column named 'Structure' from the SMILES strings
PandasTools.AddMoleculeColumnToFrame(df, molCol='Structure', smilesCol="SMILES")
# Switch on image rendering for displayed dataframes
PandasTools.RenderImagesInAllDataFrames(images=True)

# Show the first five rows (the default of head())
df.head()

# Plot some distributions

In [None]:
# Extract the maximal pChEMBL value reported for a given compound.
# pandas' max() skips NaN, whereas the builtin max() gives an order-dependent result
# as soon as one of the values is NaN (e.g. max([nan, 5]) is nan, but max([5, nan]) is 5).
df['Max_pChEMBL_Value'] = df['pChEMBL_Value'].apply(
    lambda values: pd.to_numeric(pd.Series(values), errors='coerce').max()
)

# Plot the distribution of the maximal pChEMBL values.
# Compounds without any numeric value are left out, since NaN values cannot be binned.
plt.hist(df['Max_pChEMBL_Value'].dropna(), bins=71, edgecolor='black')
plt.xlabel('Max pChEMBL value for the compound towards PKA')
plt.ylabel('Frequency')
# One tick per integer count on the frequency axis
plt.yticks(np.arange(plt.ylim()[0], plt.ylim()[1], 1))
# plt.savefig("Analysis/ChEMBL_PKA/Histogram_pChEMBL_Value_Distribution.png")
plt.show()

In [None]:
# Extract the maximal Ligand Efficiency (LE) value reported for a given compound.
# pandas' max() skips NaN, whereas the builtin max() gives an order-dependent result
# as soon as one of the values is NaN (e.g. max([nan, 5]) is nan, but max([5, nan]) is 5).
df['Max_Ligand_Efficiency'] = df['Ligand_Efficiency'].apply(
    lambda values: pd.to_numeric(pd.Series(values), errors='coerce').max()
)

# Plot the distribution of the maximal Ligand Efficiency values.
# Compounds without any numeric value are left out, since NaN values cannot be binned.
plt.hist(df['Max_Ligand_Efficiency'].dropna(), bins=71, edgecolor='black')
plt.xlabel('Max Ligand Efficiency for the compound towards PKA.')
plt.ylabel('Frequency')
# One tick per integer count on the frequency axis
plt.yticks(np.arange(plt.ylim()[0], plt.ylim()[1], 1))
# plt.savefig("Analysis/ChEMBL_PKA/Histogram_Ligand_Efficiency_Value_Distribution.png")
plt.show()