In [53]:
import pandas as pd
import os

In [54]:
# Directory holding the raw data, relative to this notebook.
DATA_PATH = "../../data/"
# Full path to the BindingDB export (tab-separated).
DATA = os.path.join(DATA_PATH, "BindingDB_All.tsv")

## Let's look for a suitable protein family to study

### Step 1. Match targets to their families. We chose the PANTHER classification, although other classification schemes exist.

In [55]:
# Load only the four columns used in this notebook; malformed rows are skipped.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column never ends up with mixed Python types.
# NOTE(review): the saved output of this cell shows pandas warning on this call,
# presumably a DtypeWarning about mixed types — confirm it is gone after re-running.
df = pd.read_csv(
    DATA,
    sep="\t",
    on_bad_lines='skip',
    usecols=[
        "BindingDB Reactant_set_id",
        "Ki (nM)",
        "IC50 (nM)",
        "UniProt (SwissProt) Entry Name of Target Chain",
    ],
    low_memory=False,
)

  df = pd.read_csv(DATA, sep="\t", on_bad_lines='skip',


We will need binding affinities for the downstream analyses, so let's keep only the rows that have a Ki value.

In [56]:
# Keep only the rows where a Ki value is present.
df = df[df["Ki (nM)"].notna()]

In [57]:
import sys
sys.path.append('../utils')
from retrieve_family import retrieve_family

uniprot_ids = df["UniProt (SwissProt) Entry Name of Target Chain"].unique()
family_dict = await retrieve_family(uniprot_ids)

# Set of unique protein families
families = set()
for key in family_dict:
    families.update(family_dict[key])

# Dictionary with the protein families as keys and the UniProt names as values
family_dict_2 = {family: [] for family in families}
for key in family_dict:
    for family in family_dict[key]:
        if key not in family_dict_2[family]:
            family_dict_2[family].append(key)

print(f"We were able to retrieve {len(families)} unique protein families")

100%|██████████| 2956/2956 [00:04<00:00, 597.32it/s] 

We were able to retrieve 3810 unique protein families





### Step 2. Now that each target is mapped to its families, let's select families with an adequate number of targets.

In [58]:
import collections

# Minimum number of proteins a family must contain to be counted.
n = 5
# Family name -> number of proteins, restricted to families with at least n proteins.
counter = collections.Counter()
for family_name, members in family_dict_2.items():
    if len(members) >= n:
        counter[family_name] = len(members)
counter.most_common(10)

[('UNCHARACTERIZED', 171),
 ('ADRENERGIC RECEPTOR-RELATED G-PROTEIN COUPLED RECEPTOR', 114),
 ('TYROSINE-PROTEIN KINASE RECEPTOR', 84),
 ('SERINE/THREONINE-PROTEIN KINASE', 81),
 ('G-PROTEIN COUPLED RECEPTOR', 81),
 ('5-HYDROXYTRYPTAMINE RECEPTOR', 80),
 ('-', 76),
 ('RIBOSOMAL PROTEIN S6 KINASE', 74),
 ('TYROSINE-PROTEIN KINASE', 64),
 ('NUCLEAR HORMONE RECEPTOR', 64)]

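Let's filter out the families labelled "-" or "UNCHARACTERIZED", and keep only the targets that belong to one of the remaining families.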

In [59]:
# Remove the "-" and "UNCHARACTERIZED" entries from the counter (no error if absent).
for unwanted in ("-", "UNCHARACTERIZED"):
    counter.pop(unwanted, None)

# Restrict the family -> targets mapping to the families still in the counter.
family_dict_2 = {
    family_name: members
    for family_name, members in family_dict_2.items()
    if family_name in counter
}

# Restrict the target -> families mapping to targets that belong to a kept family.
ids = set().union(*family_dict_2.values())
family_dict = {
    uniprot_name: family_names
    for uniprot_name, family_names in family_dict.items()
    if uniprot_name in ids
}

In [60]:
# Keep only the rows whose target survived the family filtering.
df_filtered = df.loc[
    df['UniProt (SwissProt) Entry Name of Target Chain'].isin(family_dict.keys())
]

### Step 3. We need targets with enough ligands.
Let's count ligands in each family.

In [61]:
# One row per kept family, starting with its number of targets.
summary_families = pd.DataFrame.from_dict(counter, orient='index', columns=['Number of targets'])

# Number of non-null Ki entries per target.
grouped = df_filtered.groupby('UniProt (SwissProt) Entry Name of Target Chain')["Ki (nM)"].count()
# (An optional minimum-count filter on `grouped`, with a threshold of 1000,
# was left disabled here.)

# Add each target's count to every family that target belongs to.
ligands_count = collections.Counter()
for target, target_families in family_dict.items():
    if target not in grouped:
        # Targets with no entry in `grouped` contribute nothing.
        continue
    ki_rows = grouped[target]
    for family in target_families:
        ligands_count[family] += ki_rows

summary_families["Number of ligands"] = [ligands_count[name] for name in summary_families.index]

In [62]:
# Add the ligands-per-target ratio (rounded to 2 decimals) and rank families by it,
# highest first.
summary_families = summary_families.assign(
    **{
        "Average ligands per target": lambda frame: (
            frame["Number of ligands"] / frame["Number of targets"]
        ).round(2)
    }
).sort_values(by="Average ligands per target", ascending=False)

In [63]:
# Display the first 15 rows of the family summary.
summary_families.head(n=15)

Unnamed: 0,Number of targets,Number of ligands,Average ligands per target
CB1 CANNABINOID RECEPTOR-INTERACTING PROTEIN 1,8,63768,7971.0
HISTAMINE RECEPTOR-RELATED G-PROTEIN COUPLED RECEPTOR,5,38575,7715.0
TYROSINE-PROTEIN KINASE HOPSCOTCH,9,38206,4245.11
"FAM11A, B PROTEIN",5,19805,3961.0
INTERFERON/INTERLEUKIN RECEPTOR,7,22539,3219.86
PERIPHERAL-TYPE BENZODIAZEPINE RECEPTOR,5,15866,3173.2
REGULATOR OF G PROTEIN SIGNALING,10,31690,3169.0
PITUITARY HOMEOBOX HOMOLOG PTX1,6,18285,3047.5
CHLORIDE INTRACELLULAR CHANNEL PROTEIN 6-RELATED,13,36930,2840.77
PROTEIN BHLHB9-RELATED,8,22543,2817.88


### Step 4. Zoom on suitable candidates.
Let's look at our winners! At this stage, we used a combination of data-balance considerations, personal preferences and biological knowledge to select a family with a manageable number of targets and a considerable number of ligands per target.

In [64]:
# Print the per-target count for each member of the chosen family,
# skipping members that have no entry in `grouped`.
for target in family_dict_2["TYROSINE-PROTEIN KINASE HOPSCOTCH"]:
    if target not in grouped:
        continue
    print(f"{target} has {grouped[target]} ligands")

CCR5_MOUSE has 57 ligands
CCR5_HUMAN has 219 ligands
PDE4A_HUMAN has 104 ligands
MERTK_HUMAN has 12 ligands
TEC_HUMAN has 2 ligands
JAK2_HUMAN has 5039 ligands
JAK3_HUMAN has 1993 ligands
TYK2_HUMAN has 2505 ligands
JAK1_HUMAN has 4908 ligands
