In [13]:
import requests
import json
from tqdm import tqdm
import pandas as pd
import numpy as np

In [14]:
# Directory holding the input data, relative to the notebook's working directory.
DATA_PATH = './src/data/'
# Full path to the BindingDB sample file.
DATA = f"{DATA_PATH}BindingDB_sample.tsv"

## Let's look for a suitable protein family to study

Step 1. Match each target to its protein family. We chose the PANTHER classification, although other classification schemes exist.

In [15]:
# Load the tab-separated BindingDB sample. Malformed lines are skipped instead of
# aborting the read, and the reactant-set id column is forced to 32-bit integers.
df = pd.read_csv(
    DATA,
    sep="\t",
    on_bad_lines="skip",
    dtype={"BindingDB Reactant_set_id": np.int32},
)

In [None]:
import sys

# Make the project-local helper module importable. The path is relative to the
# directory the notebook is run from.
sys.path.append('./src/utils')
from retrieve_family import retrieve_family

# Distinct target entry names to look up. Rows without an entry name (NaN) are
# dropped first so that a missing value is never sent to the lookup as an ID.
uniprot_ids = df["UniProt (SwissProt) Entry Name of Target Chain"].dropna().unique()

# Retrieve the family information for every target and keep a copy on disk.
try:
    family_dict = retrieve_family(uniprot_ids)
    # Save results to a file
    with open("protein_families.json", "w") as f:
        json.dump(family_dict, f, indent=2)
    print("Protein families saved to 'protein_families.json'")
except Exception as e:
    print(f"Error: {e}")
    # Re-raise: later cells need `family_dict`, so carrying on after a failed
    # lookup would only surface as an unrelated NameError further down.
    raise

ModuleNotFoundError: No module named 'retrieve_family'

In [None]:
# Build a one-column dataframe from the mapping: keys become the index,
# values go into the 'Family' column.
df_family = pd.DataFrame.from_dict(family_dict, orient='index', columns=['Family'])

family_column = df_family['Family']
# How many distinct protein families are present
n_unique_families = family_column.nunique()
print(f"There are {n_unique_families} unique families in this dataset")
# How many targets fall into each family
family_column.value_counts()

There are 343 unique families in this dataset


"Family\n5-HYDROXYTRYPTAMINE RECEPTOR                                                                             14\nNEUROPEPTIDES RECEPTOR                                                                                   11\nADRENERGIC RECEPTOR-RELATED G-PROTEIN COUPLED RECEPTOR                                                    9\nCARBONIC ANHYDRASE                                                                                        9\nNEUROTRANSMITTER GATED ION CHANNEL                                                                        8\nCYCLIC NUCLEOTIDE PHOSPHODIESTERASE                                                                       8\n-                                                                                                         7\nOLFACTORY RECEPTOR AND ADENOSINE RECEPTOR                                                                 7\nSODIUM/CHLORIDE DEPENDENT TRANSPORTER                                                                     6\nPHOSPHATID

In [None]:
# Minimum number of occurrences a family needs in order to be kept
n = 5


def _is_large_enough(family_rows):
    """Return True when the family group holds at least `n` rows."""
    return len(family_rows) >= n


# Keep only the sufficiently populated families, then drop rows with NaN values
df_family = df_family.groupby('Family').filter(_is_large_enough).dropna()

In [38]:
# Restrict the original dataframe to rows whose target is still listed in df_family
target_has_family = df['UniProt (SwissProt) Entry Name of Target Chain'].isin(df_family.index)
df = df[target_has_family]

In [41]:
# Number of rows per target entry name, most frequent targets first
rows_per_target = df.groupby('UniProt (SwissProt) Entry Name of Target Chain').size()
rows_per_target.sort_values(ascending=False)

UniProt (SwissProt) Entry Name of Target Chain
VGFR2_HUMAN    8
CAH2_HUMAN     7
CAH9_HUMAN     6
PK3CD_HUMAN    6
DRD3_HUMAN     6
              ..
SSR5_HUMAN     1
UBP19_HUMAN    1
UBP47_HUMAN    1
UBP7_HUMAN     1
UFO_HUMAN      1
Length: 123, dtype: int64

In [None]:
# Preview the filtered dataframe. The saved output of this cell is a five-row
# table, so the expression producing it is restored here (presumably df.head();
# confirm against the notebook history).
df.head()

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
1,50411131,NC1CCN(Cc2ccn3ncnc(Oc4ccc(NC(=O)NC(=O)Cc5ccc(F...,InChI=1S/C27H27F2N7O3/c28-19-3-1-17(2-4-19)13-...,YQQFRBUHZZNTGY-UHFFFAOYSA-N,50235544,1-(4-(5-((4-aminopiperidin-1-yl)methyl)pyrrolo...,MAP kinase-activated protein kinase 2,Homo sapiens,,>5000,...,,,,,,,,,,
4,50138447,NS(=O)(=O)c1ccc(c(COc2ccc(cc2)-c2nc3cc(ccc3n2C...,InChI=1S/C33H30ClN3O5S/c34-25-11-6-21(7-12-25)...,PEORIWUYJQYMKR-UHFFFAOYSA-N,50191532,2-[4-(4'-chloro-4-sulfamoylbiphenyl-2-ylmethox...,RNA-directed RNA polymerase,Hepatitis C virus,,16,...,,,,,,,,,,
5,50810600,NS(=O)(=O)c1nnc(NC(=O)CN(CCN(CC(O)=O)c2ccccc2O...,"InChI=1S/C20H22N6O7S2/c21-35(32,33)20-24-23-19...",GRMCPBPSVHZQFB-UHFFFAOYSA-N,50292079,CHEMBL284071::[(2-Hydroxy-phenyl)-(2-{(2-hydro...,Carbonic anhydrase 4,Bos taurus,105.0,,...,,,,,,,,,,
12,50659540,O=C1c2ccccc2-c2n[nH]c3cccc1c23,InChI=1S/C14H8N2O/c17-14-9-5-2-1-4-8(9)13-12-1...,ACPOUJIDANTYHO-UHFFFAOYSA-N,16018,"14,15-diazatetracyclo[7.6.1.0^{2,7}.0^{13,16}]...",Mitogen-activated protein kinase 8,Homo sapiens,,14000,...,,,,,,,,,,
13,1141475,FC(F)(F)S(=O)(=O)Nc1ccc(cc1)-c1cccc(c1)N(CC12C...,"InChI=1S/C33H34F4N4O4S/c34-32-17-31(18-32,19-3...",ASGHGBJDGBIASZ-UHFFFAOYSA-N,538359,"US11254663, Example 267",Bile acid receptor,Homo sapiens,,,...,,,,,,,,,,
