In [1]:
import pandas as pd
import numpy as np
from scipy.stats import rankdata

# ========= 1. Read EGFR file =========
# Load the raw EGFR activity export into a DataFrame and report what was read.

raw_path = "3.1.1 raw/egfr_inhibitors_raw.csv"

df_raw = pd.read_csv(
    raw_path,
    sep=";",           # fields in this file are separated by semicolons
    engine="python",
)

# Quick sanity check: table dimensions and the available column names.
print(f"Orginal data shape： {df_raw.shape}")
print(f"Columns： {df_raw.columns.tolist()}")

Orginal data shape： (25758, 48)
Columns： ['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase', 'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment', 'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE', 'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate', 'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID', 'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID', 'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction', 'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation', 'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type', 'Document ChEMBL ID', 'Source ID', 'Source Description', 'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties', 'Action Type', 'Standard Text Value', 'Value']


In [2]:
# ========= 2. Basic filtration: keep rows with IC50, unit nM, SMILES, pChEMBL =========

# Work on a copy so the raw table (df_raw) is left untouched.
df = df_raw.copy()

# Criterion 1: measurement type is IC50 (compared case-insensitively).
mask_ic50 = df["Standard Type"].str.upper().eq("IC50")

# Criterion 2: value is reported in nM (compared case-insensitively).
mask_nM = df["Standard Units"].str.lower().eq("nm")

# Criterion 3: a pChEMBL value is present.
mask_pch = ~df["pChEMBL Value"].isna()

# Criterion 4: a SMILES string is present.
mask_smiles = ~df["Smiles"].isna()

# A row is kept only when it satisfies all four criteria at once.
keep_rows = mask_ic50 & mask_nM & mask_pch & mask_smiles
df_f = df.loc[keep_rows].copy()

print(f"Filtered Shape： {df_f.shape}")

Filtered Shape： (18633, 48)


In [3]:
# ========= 3. Convert pChEMBL to Numeric, Compute IC50 (µM) =========

# Coerce pChEMBL to float; anything that cannot be parsed becomes NaN
# instead of raising.
df_f["pchembl"] = pd.to_numeric(df_f["pChEMBL Value"], errors="coerce")

# The earlier notna() filter ran BEFORE this coercion, so coercion can still
# introduce NaN. If every measurement of a SMILES were NaN, its aggregated
# pChEMBL and IC50 would be NaN too; a later `NaN <= threshold` comparison is
# False, which would silently label that molecule inactive, and the NaN would
# also end up in any rank computed over activity_uM. Drop such rows here and
# report them (nothing is printed when there are none).
n_unparsed = int(df_f["pchembl"].isna().sum())
if n_unparsed:
    print("Dropped rows with non-numeric pChEMBL：", n_unparsed)
    df_f = df_f.dropna(subset=["pchembl"]).copy()

# pChEMBL → IC50 (µM)
# p = -log10(IC50[M]) → IC50[M] = 10^(-p) → * 1e6 = µM
df_f["activity_uM"] = 10 ** (-df_f["pchembl"]) * 1e6

# ========= 4. Aggregate duplicated molecules (same SMILES) =========
# One SMILES may appear multiple times in ChEMBL (different assays or labs).
# This notebook keeps a single value per SMILES: the MAX pChEMBL, i.e. the
# strongest reported potency.

agg = df_f.groupby("Smiles").agg(
    pchembl=("pchembl", "max"),        # largest pChEMBL = strongest
).reset_index()

# Invariant relied on downstream: every aggregated molecule has a real value.
assert agg["pchembl"].notna().all(), "aggregated pChEMBL contains NaN"

# Recompute IC50 (μM) from the aggregated pChEMBL
agg["activity_uM"] = 10 ** (-agg["pchembl"]) * 1e6

print("no. of unique SMILES：", agg.shape[0])

no. of unique SMILES： 10466


In [4]:
# ========= 5. Generate binary activity using threshold = 100 nM =========
# IC50 ≤ 100 nM (0.1 μM) → ACTIVE (1), otherwise INACTIVE (0).
# NOTE(review): the 100 nM cut-off is the author's choice, described as a
# kinase / EGFR medicinal chemistry convention — confirm for your use case.
threshold_uM = 0.1  # 100 nM
agg["bin_activity"] = (agg["activity_uM"] <= threshold_uM).astype(int)

# ========= 6. Generate normalized activity (rank-based) =========
# - rankdata ranks ascending, so IC50 is negated first: the most potent
#   molecule (smallest IC50) gets the highest rank.
# - Dividing by n maps the ranks into (0, 1] — the lowest possible value is
#   1/n, not 0.
# - method="average": molecules with identical IC50 share one rank and hence
#   one norm_activity. With method="ordinal" ties receive distinct ranks in
#   order of occurrence, so equally potent molecules would get different
#   values that depend only on row order.

y = np.array(agg["activity_uM"])
r = rankdata(-y, method="average") / len(y)
agg["norm_activity"] = r