# Data Loading and Initial Processing
Load the CSV data using pandas and perform initial cleaning steps. Handle missing values and standardize column names.

In [2]:
import pandas as pd

# Load the CSV data
data = pd.read_csv('data/BindingDB_col_cleaned.tsv', delimiter='\t')

# Display the first few rows of the dataframe
data.head()

# Standardize column names
data.columns = [col.strip().replace(' ', '_').replace('/', '_') for col in data.columns]

# Handle missing values by filling them with NaN
data = data.fillna(pd.NA)

# Display the cleaned dataframe
data.head()

Unnamed: 0,Ligand_SMILES,BindingDB_Ligand_Name,Target_Name,Target_Source_Organism_According_to_Curator_or_DataSource,Ki_(nM),IC50_(nM),Kd_(nM),EC50_(nM),kon_(M-1-s-1),koff_(s-1),...,Number_of_Protein_Chains_in_Target_(>1_implies_a_multichain_complex),BindingDB_Target_Chain_Sequence,UniProt_(SwissProt)_Entry_Name_of_Target_Chain,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain,BindingDB_Target_Chain_Sequence.1,UniProt_(SwissProt)_Entry_Name_of_Target_Chain.1,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain.1,BindingDB_Target_Chain_Sequence.2,UniProt_(SwissProt)_Entry_Name_of_Target_Chain.2,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain.2
0,COc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Methoxyphenyl)-amino)-4-((3-chlorophenyl...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,5800,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
1,Oc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,>1000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
2,COc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Methoxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,4300,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3,Oc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,3000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
4,CC(C)(C)OC(=O)Nc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl...,3-((4-(N-BOC-amino)phenyl)amino)-4-((3-chlorop...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,57000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,


# Extract CDK-Related Entries
Filter the dataset to keep only CDK-related entries. Create a subset of 47 CDKs and their binding ligands.

In [3]:
import pandas as pd

# Filter the dataset to keep only CDK-related entries
cdk_data = data[data['Target_Name'].str.contains('Cyclin-dependent kinase', na=False)]

# Create a subset of 47 CDKs and their binding ligands
cdk_subset = cdk_data.head(47)

# Display the subset dataframe
cdk_subset.head()

Unnamed: 0,Ligand_SMILES,BindingDB_Ligand_Name,Target_Name,Target_Source_Organism_According_to_Curator_or_DataSource,Ki_(nM),IC50_(nM),Kd_(nM),EC50_(nM),kon_(M-1-s-1),koff_(s-1),...,Number_of_Protein_Chains_in_Target_(>1_implies_a_multichain_complex),BindingDB_Target_Chain_Sequence,UniProt_(SwissProt)_Entry_Name_of_Target_Chain,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain,BindingDB_Target_Chain_Sequence.1,UniProt_(SwissProt)_Entry_Name_of_Target_Chain.1,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain.1,BindingDB_Target_Chain_Sequence.2,UniProt_(SwissProt)_Entry_Name_of_Target_Chain.2,UniProt_(TrEMBL)_Entry_Name_of_Target_Chain.2
0,COc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Methoxyphenyl)-amino)-4-((3-chlorophenyl...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,5800,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
1,Oc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1,3-((4-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,>1000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
2,COc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Methoxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,4300,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
3,Oc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1,3-((3-Hydroxyphenyl)amino)-4-((3-chlorophenyl)...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,3000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,
4,CC(C)(C)OC(=O)Nc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl...,3-((4-(N-BOC-amino)phenyl)amino)-4-((3-chlorop...,Cyclin-dependent kinase/G2/mitotic-specific cy...,Homo sapiens,,57000,,,,,...,2,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,CDK1_HUMAN,,MALRVTRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALG...,CCNB1_HUMAN,,,,


# Ligand Analysis and Processing
Extract unique ligands from the dataset. Process SMILES strings and create a list of unique binding ligands.

In [4]:
import pandas as pd

# Extract unique ligands from the dataset
unique_ligands = cdk_subset['Ligand_SMILES'].unique()

# Process SMILES strings and create a list of unique binding ligands
unique_ligands_list = list(unique_ligands)

# Display the number of unique ligands
print(f"Number of unique ligands: {len(unique_ligands_list)}")

# Display the first few unique ligands
unique_ligands_list[:5]

Number of unique ligands: 33


['COc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1',
 'Oc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1',
 'COc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1',
 'Oc1cccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)c1',
 'CC(C)(C)OC(=O)Nc1ccc(Nc2[nH]nc3ncnc(Nc4cccc(Cl)c4)c23)cc1']

# Binding Affinity Dataframes Creation
Create separate dataframes for Ki, IC50, Kd, and EC50 measurements. Handle missing values and normalize the data.

In [5]:
import pandas as pd
import numpy as np

# Create separate dataframes for each binding affinity measure
ki_df = cdk_subset.pivot(index='Target_Name', columns='Ligand_SMILES', values='Ki_(nM)')
ic50_df = cdk_subset.pivot(index='Target_Name', columns='Ligand_SMILES', values='IC50_(nM)')
kd_df = cdk_subset.pivot(index='Target_Name', columns='Ligand_SMILES', values='Kd_(nM)')
ec50_df = cdk_subset.pivot(index='Target_Name', columns='Ligand_SMILES', values='EC50_(nM)')

# Handle missing values by filling them with NaN
ki_df = ki_df.fillna(np.nan)
ic50_df = ic50_df.fillna(np.nan)
kd_df = kd_df.fillna(np.nan)
ec50_df = ec50_df.fillna(np.nan)

# Normalize the data (e.g., log transformation)
ki_df = ki_df.applymap(lambda x: np.log10(x) if pd.notnull(x) else x)
ic50_df = ic50_df.applymap(lambda x: np.log10(x) if pd.notnull(x) else x)
kd_df = kd_df.applymap(lambda x: np.log10(x) if pd.notnull(x) else x)
ec50_df = ec50_df.applymap(lambda x: np.log10(x) if pd.notnull(x) else x)

# Display the created dataframes
ki_df.head()
ic50_df.head()
kd_df.head()
ec50_df.head()

ValueError: Index contains duplicate entries, cannot reshape

# Affinity Score Classification
Implement classification logic for binding affinities (weak, moderate, strong) for each measurement type. Create labeled datasets.

In [None]:
import pandas as pd
import numpy as np

# Define thresholds for classification
ki_thresholds = {'weak': 10000, 'moderate': 1000, 'strong': 100}
ic50_thresholds = {'weak': 10000, 'moderate': 1000, 'strong': 100}
kd_thresholds = {'weak': 10000, 'moderate': 1000, 'strong': 100}
ec50_thresholds = {'weak': 10000, 'moderate': 1000, 'strong': 100}

# Function to classify affinity scores
def classify_affinity(value, thresholds):
    if pd.isna(value):
        return 'unknown'
    elif value > thresholds['weak']:
        return 'weak'
    elif value > thresholds['moderate']:
        return 'moderate'
    else:
        return 'strong'

# Apply classification to each dataframe
ki_labels = ki_df.applymap(lambda x: classify_affinity(x, ki_thresholds))
ic50_labels = ic50_df.applymap(lambda x: classify_affinity(x, ic50_thresholds))
kd_labels = kd_df.applymap(lambda x: classify_affinity(x, kd_thresholds))
ec50_labels = ec50_df.applymap(lambda x: classify_affinity(x, ec50_thresholds))

# Combine all labels into a single dataframe
combined_labels = pd.concat([ki_labels, ic50_labels, kd_labels, ec50_labels], keys=['Ki', 'IC50', 'Kd', 'EC50'])

# Display the labeled dataframes
ki_labels.head()
ic50_labels.head()
kd_labels.head()
ec50_labels.head()
combined_labels.head()

# Create Combined Dataset
Merge all information into a single comprehensive dataframe with ligand-CDK pairs and their corresponding affinity measures and labels.

In [None]:
import pandas as pd

# Combine all affinity measures into a single dataframe
combined_affinities = pd.concat([ki_df, ic50_df, kd_df, ec50_df], keys=['Ki', 'IC50', 'Kd', 'EC50'])

# Reset index to have a flat dataframe
combined_affinities = combined_affinities.reset_index()

# Rename columns for clarity
combined_affinities.columns = ['Measure', 'Target_Name', 'Ligand_SMILES', 'Affinity']

# Merge the combined affinities with the combined labels
combined_dataset = pd.merge(combined_affinities, combined_labels.reset_index(), on=['Measure', 'Target_Name', 'Ligand_SMILES'])

# Rename columns for clarity
combined_dataset.columns = ['Measure', 'Target_Name', 'Ligand_SMILES', 'Affinity', 'Label']

# Display the combined dataset
combined_dataset.head()

# Clustering Analysis
Implement clustering methods to group similar ligands. Use chemical similarity metrics and molecular fingerprints.

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Function to compute molecular fingerprints
def compute_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            fingerprints.append(fp)
    return fingerprints

# Compute fingerprints for unique ligands
fingerprints = compute_fingerprints(unique_ligands_list)

# Convert fingerprints to numpy array
fingerprint_array = np.array([list(fp) for fp in fingerprints])

# Perform PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_result = pca.fit_transform(fingerprint_array)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(fingerprint_array)

# Create a dataframe for PCA results and clusters
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
pca_df['Cluster'] = clusters
pca_df['Ligand_SMILES'] = unique_ligands_list

# Plot the PCA results with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('PCA of Ligand Fingerprints with KMeans Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()

# Display the PCA dataframe with clusters
pca_df.head()

# Visualization and Results
Create visualizations of the clusters, affinity distributions, and key findings using matplotlib and seaborn.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the distribution of affinity measures
plt.figure(figsize=(12, 6))
sns.histplot(data=combined_dataset, x='Affinity', hue='Measure', multiple='stack', bins=50, palette='Set2')
plt.title('Distribution of Affinity Measures')
plt.xlabel('Log Affinity (nM)')
plt.ylabel('Count')
plt.legend(title='Measure')
plt.show()

# Plot the distribution of affinity labels
plt.figure(figsize=(12, 6))
sns.countplot(data=combined_dataset, x='Label', hue='Measure', palette='Set2')
plt.title('Distribution of Affinity Labels')
plt.xlabel('Affinity Label')
plt.ylabel('Count')
plt.legend(title='Measure')
plt.show()

# Plot the PCA results with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('PCA of Ligand Fingerprints with KMeans Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()

# Display the PCA dataframe with clusters
pca_df.head()