# ChEMBL molecules from UniProt IDs

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local helper modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

from chembl_utils import targets_by_uniprot_ids, bioactivities_by_target_chembl_ids, standardize_bioactivities, molecules_by_molecule_chembl_ids

In [3]:
# Input location: the ProBiS results folder and the UniProt ID table stored in it
PROBIS_FOLDER = Path('.').joinpath('..', 'data', 'probis', 'probis_pocket_15_0.5')
UNIPROT_IDS_PATH = PROBIS_FOLDER.joinpath('simProtTable_6lu7A_uniprot_ids.csv')

# Activity threshold: a molecule is considered active if its pIC50 is at least this value
PIC50_CUTOFF = 6.3

## Aim of this notebook

Get molecules that have been measured as "active" against targets with the given UniProt IDs. A molecule counts as "active" if its pIC50 is equal to or greater than `PIC50_CUTOFF`.

1. Get UniProt IDs
2. Get ChEMBL targets by UniProt IDs
3. Filter ChEMBL targets
  - Only single proteins
4. Get ChEMBL bioactivities by target ChEMBL IDs
  - Only IC50, assay type B, and exact measurements
5. Filter ChEMBL bioactivities by certain criteria 
  - Only best pIC50 if molecule measured multiple times
  - Only "active" molecules with pIC50 equal or greater than pIC50 cutoff
6. Get ChEMBL molecules by molecule ChEMBL IDs (e.g. SMILES)
7. Combine and save molecule, bioactivity, and target data

## 1. Get UniProt IDs

In [4]:
# The CSV has no header row; its first column holds the UniProt IDs
uniprot_ids = pd.read_csv(UNIPROT_IDS_PATH, header=None).iloc[:, 0].tolist()
print(f'Number of UniProt IDs: {len(uniprot_ids)}')

Number of UniProt IDs: 62


## 2. Get ChEMBL targets

Get target data from ChEMBL that are linked to UniProt IDs.

In [5]:
# Look up the ChEMBL targets linked to the UniProt IDs, using the helper imported
# from chembl_utils.
# NOTE(review): presumably one row per UniProt ID/target pair - confirm in chembl_utils.
targets = targets_by_uniprot_ids(uniprot_ids)
# Display the (rows, columns) of the retrieved target table
targets.shape

(27, 5)

## 3. Filter ChEMBL targets

- Use only single proteins (complexes and families might be too unspecific for compound search later)
- TODO: decide whether further target filters are needed

In [6]:
# Number of retrieved targets per ChEMBL target type
targets.groupby(by='target_type').size()

target_type
PROTEIN COMPLEX                 9
PROTEIN FAMILY                  5
PROTEIN-PROTEIN INTERACTION     1
SINGLE PROTEIN                 12
dtype: int64

In [7]:
# Restrict the target table to single-protein targets
is_single_protein = targets['target_type'] == 'SINGLE PROTEIN'
targets = targets.loc[is_single_protein]
targets.shape

(12, 5)

In [8]:
# Show the remaining targets ordered by their target ChEMBL ID
targets.sort_values(by='target_chembl_id')

Unnamed: 0,organism,pref_name,target_chembl_id,target_type,uniprot_id
2,Homo sapiens,Retinoid X receptor alpha,CHEMBL2061,SINGLE PROTEIN,P19793
15,Homo sapiens,Tyrosine-protein kinase ITK/TSK,CHEMBL2959,SINGLE PROTEIN,Q08881
1,Human coronavirus NL63,Replicase polyprotein 1a,CHEMBL3232683,SINGLE PROTEIN,P0C6U6
47,Homo sapiens,Acetyl-CoA carboxylase 1,CHEMBL3351,SINGLE PROTEIN,Q13085
28,Homo sapiens,"POU domain, class 2, transcription factor 2",CHEMBL3509582,SINGLE PROTEIN,P09086
30,Homo sapiens,Programmed cell death 1 ligand 1,CHEMBL3580522,SINGLE PROTEIN,Q9NZQ7
14,Homo sapiens,Palmitoleoyl-protein carboxylesterase NOTUM,CHEMBL3714531,SINGLE PROTEIN,Q6P988
13,Homo sapiens,Cytosolic phospholipase A2,CHEMBL3816,SINGLE PROTEIN,P47712
0,SARS coronavirus,SARS coronavirus 3C-like proteinase,CHEMBL3927,SINGLE PROTEIN,P0C6U8
29,Penicillium janthinellum,Penicillopepsin,CHEMBL4254,SINGLE PROTEIN,P00798


In [9]:
# Number of remaining targets per organism
targets.groupby(by='organism').size()

organism
Homo sapiens                8
Human coronavirus NL63      1
Penicillium janthinellum    1
Plasmodium falciparum       1
SARS coronavirus            1
dtype: int64

## 4. Get ChEMBL bioactivities

Get bioactivity data from ChEMBL that are linked to the target ChEMBL IDs.

In [10]:
# Fetch the bioactivities measured against the selected targets
target_chembl_ids = targets['target_chembl_id']
bioactivities = bioactivities_by_target_chembl_ids(target_chembl_ids)
bioactivities.shape

Progress: 0/12
Progress: 10/12


(5913, 11)

In [11]:
# Standardize bioactivities (convert IC50 values to nM and calculate pIC50).
# NOTE(review): this overwrites `bioactivities`, so re-running only this cell feeds
# already-standardized data back into the helper - confirm the helper tolerates that.
bioactivities = standardize_bioactivities(bioactivities)
# Display the (rows, columns) of the standardized table
bioactivities.shape

(5913, 12)

## 5. Filter ChEMBL bioactivities

### Filter only for entries reaching a defined bioactivity threshold

In [12]:
# Keep only measurements whose pIC50 reaches the activity cutoff
is_active = bioactivities['pIC50'] >= PIC50_CUTOFF
bioactivities_active = bioactivities.loc[is_active]
bioactivities_active.shape

(4606, 12)

### Some molecules have multiple bioactivity measurements; keep only the best (highest pIC50) measurement per molecule

In [13]:
# Keep one row per molecule: its best (highest pIC50) measurement.
# Sorting descending on both keys places each molecule's highest pIC50 first within
# its group, so keep='first' retains exactly that row.
# Written as one chained reassignment rather than an inplace=True mutation, so the
# frame is never modified behind the name between statements.
bioactivities_active = (
    bioactivities_active
    .sort_values(['molecule_chembl_id', 'pIC50'], ascending=False)
    .drop_duplicates('molecule_chembl_id', keep='first')
)

# Invariant relied on by the later merges: exactly one row per molecule
assert bioactivities_active['molecule_chembl_id'].is_unique
bioactivities_active.shape

(4121, 12)

In [14]:
# Preview the first rows of the deduplicated active bioactivities
bioactivities_active.head(n=5)

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,type,units,IC50,pIC50
2,278634,CHEMBL764691,In vitro concentration required for inhibition...,B,CHEMBL98043,=,CHEMBL4414,Plasmodium falciparum,IC50,nM,123.0,6.910095
12,666081,CHEMBL820842,Inhibition of Human cPLA2 alpha using Enzyme a...,B,CHEMBL9277,=,CHEMBL3816,Homo sapiens,IC50,nM,78.0,7.107905
2,638119,CHEMBL820842,Inhibition of Human cPLA2 alpha using Enzyme a...,B,CHEMBL9161,=,CHEMBL3816,Homo sapiens,IC50,nM,2.1,8.677781
3,639330,CHEMBL820842,Inhibition of Human cPLA2 alpha using Enzyme a...,B,CHEMBL9021,=,CHEMBL3816,Homo sapiens,IC50,nM,420.0,6.376751
8,650073,CHEMBL820842,Inhibition of Human cPLA2 alpha using Enzyme a...,B,CHEMBL8973,=,CHEMBL3816,Homo sapiens,IC50,nM,5.3,8.275724


## 6. Get ChEMBL molecules

In [15]:
# Fetch the molecule records for all active molecules
molecule_chembl_ids = bioactivities_active['molecule_chembl_id']
molecules = molecules_by_molecule_chembl_ids(molecule_chembl_ids)
molecules.shape

Progress 2020-03-25 17:25:28.001834: 0/4121
Progress 2020-03-25 17:25:32.704314: 1000/4121
Progress 2020-03-25 17:25:37.196284: 2000/4121
Progress 2020-03-25 17:25:41.154708: 3000/4121
Progress 2020-03-25 17:25:45.087334: 4000/4121


(4121, 4)

## 7. Combine and save molecule/bioactivity/target data

In [16]:
# Attach the molecule data to each bioactivity row (left join keeps every
# bioactivity row, even if no molecule record was retrieved for it).
# validate='many_to_one' makes pandas raise a MergeError if `molecules` lists a
# molecule_chembl_id more than once, instead of silently duplicating rows.
dataset = pd.merge(
    bioactivities_active,
    molecules,
    on='molecule_chembl_id',
    how='left',
    validate='many_to_one'
)
dataset.shape

(4121, 15)

In [17]:
# Attach the target data to each row (left join keeps every row of `dataset`).
# validate='many_to_one' makes pandas raise a MergeError if `targets` lists a
# target_chembl_id more than once (e.g. one target linked to several UniProt IDs),
# instead of silently duplicating rows.
dataset = pd.merge(
    dataset,
    targets,
    on='target_chembl_id',
    how='left',
    validate='many_to_one'
)
dataset.shape

(4121, 19)

In [18]:
# Write the combined table next to the UniProt ID input file, without the index column
output_path = UNIPROT_IDS_PATH.parent / 'chembl_molecules_by_uniprot_ids.csv'
dataset.to_csv(output_path, index=False)