# Add ChEMBL target classification

In [1]:
from local_utils.file_utils import backup_file

## Configuration

In [2]:
# ChEMBL connection: the connection string is read from the local file
# 'database.txt' (surrounding whitespace stripped) and handed to create_engine.
# The file is opened in a context manager so that the handle is closed as soon
# as the string has been read, rather than being left open.

with open('database.txt') as url_file:
    engine = create_engine(url_file.read().strip())

## Target classification

Using a portion of the full ChEMBL target classification hierarchy in the table '`target_class`'.

Note that classification is only performed on Human targets; the classification of Rat targets in ChEMBL is very patchy, and the by-symbol classification of Human targets may easily be applied to the Rat targets.

In [3]:
# Load the distinct (target, class path) rows for the Human, non-excluded
# (exclude = 0) entries of 'tt_chembl_targets', joined to 'target_class'
# through 'chembl_id_lookup' (entity_type = 'TARGET').
#
# The 'target_class' column is a path string built in SQL: l1, followed by
# ' > ' + l2 when l2 is not null, followed by ' > ' + l3 only when l2 is
# 'Kinase' and l3 is not null. Rows are ordered by chembl_id, then class path.
target_class = pd.read_sql_query("""
select
  distinct
    x.chembl_id
  , x.pref_name	
  , x.symbol
  , x.approved_name	
  , b.l1
        || case when b.l2 is not null then ' > ' || b.l2
            || case when b.L2 = 'Kinase' and b.L3 is not null then ' > ' || b.L3
            else '' end
        else '' end
    as target_class
from
  tt_chembl_targets x
  join chembl_20_app.chembl_id_lookup a on x.chembl_id = a.chembl_id
  join chembl_20_app.target_class b on a.entity_id = b.tid
where
  a.entity_type = 'TARGET'
and x.species = 'Human'
and x.exclude = 0
order by
    x.chembl_id
  , target_class
""", engine)

# (rows, columns) of the query result.
target_class.shape

(258, 5)

In [4]:
# Check whether any of a fixed list of gene symbols appears in the
# classification table.
symbols = [
    'ABCG5', 'ABCG8',
    'ATP1A1', 'ATP1A2', 'ATP1A3', 'ATP1A4',
    'ATP1B1', 'ATP1B2', 'ATP1B3', 'ATP1B4',
    'ATP8B1',
    'KCNA7', 'KCNC4', 'KCNE1', 'KCNJ12', 'KCNJ3', 'KCNJ5',
    'KCNK1', 'KCNK4', 'KCNK6',
    'NPR2',
    'SLC29A2', 'SLC4A2', 'SLC51B',
]

target_class[target_class['symbol'].isin(symbols)]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class


In [4]:
# Preview the first rows of the classification table, rendered explicitly as
# an HTML table.
HTML(target_class.head().to_html())

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
0,CHEMBL1075094,Nuclear factor erythroid 2-related factor 2,NFE2L2,"nuclear factor, erythroid 2-like 2",Unclassified protein
1,CHEMBL1250417,Potassium/sodium hyperpolarization-activated cyclic nucleotide-gated channel 4,HCN4,hyperpolarization activated cyclic nucleotide gated potassium channel 4,Ion channel > Voltage-gated ion channel
2,CHEMBL1287617,UDP-glucuronosyltransferase 1-1,UGT1A1,"UDP glucuronosyltransferase 1 family, polypeptide A1",Enzyme
3,CHEMBL1628481,Apelin receptor,APLNR,apelin receptor,Membrane receptor > Family A G protein-coupled receptor
4,CHEMBL1641347,Solute carrier family 22 member 6,SLC22A6,"solute carrier family 22 (organic anion transporter), member 6",Transporter > Electrochemical transporter


### Targets with Multiple classes

In [5]:
# Inspect cases where there are multiple classes for a target...

def f(x):
    """Summarise the classes of one group of `target_class` rows.

    Intended for use with ``groupby(...).apply``: `x` is the sub-frame for a
    single group and must have a 'target_class' column.

    Returns a one-element Series keyed 'target_classes'. Its value is the list
    of the group's classes when the group has more than one row, and None
    otherwise, so that single-class groups can be removed afterwards with
    ``dropna``.
    """
    if len(x) > 1:
        classes = x['target_class'].tolist()
    else:
        classes = None

    return pd.Series({'target_classes': classes})

# One row per (chembl_id, pref_name) that has more than one class; groups with
# a single class produce an all-null row from f() and are dropped.
df = (
    target_class
    .groupby(['chembl_id', 'pref_name'])
    .apply(f)
    .dropna(how='all')
    .reset_index()
)

df

Unnamed: 0,chembl_id,pref_name,target_classes
0,CHEMBL1907601,Cyclin-dependent kinase 4/cyclin D1,"[Enzyme > Kinase > Protein Kinase, Other cytosolic protein]"
1,CHEMBL1907605,Cyclin-dependent kinase 2/cyclin E1,"[Enzyme > Kinase > Protein Kinase, Other cytosolic protein]"
2,CHEMBL2094126,Cyclin-dependent kinase 2/cyclin E,"[Enzyme > Kinase > Protein Kinase, Other cytosolic protein]"
3,CHEMBL2094128,Cyclin-dependent kinase 2/cyclin A,"[Enzyme > Kinase > Protein Kinase, Other cytosolic protein]"
4,CHEMBL2095198,"Sulfonylurea receptor 2, Kir6.2","[Ion channel > Voltage-gated ion channel, Transporter > Primary active transporter]"
5,CHEMBL2095942,Cyclin-dependent kinase 4/cyclin D,"[Enzyme > Kinase > Protein Kinase, Other cytosolic protein]"
6,CHEMBL2096972,"Sulfonylurea receptor 1, Kir6.2","[Ion channel > Voltage-gated ion channel, Transporter > Primary active transporter]"
7,CHEMBL2107838,"Calcitonin-gene-related peptide receptor, CALCRL/RAMP1","[Membrane receptor > Family B G protein-coupled receptor, Other membrane protein]"
8,CHEMBL2109232,Adrenomedullin receptor AM1; CALCRL/RAMP2,"[Membrane receptor > Family B G protein-coupled receptor, Other membrane protein]"
9,CHEMBL2111189,"Amylin receptor AMY1, CALCR/RAMP1","[Membrane receptor > Family B G protein-coupled receptor, Other membrane protein]"


### Filter out less-useful classifications

These tend to be for ancillary proteins, regulatory subunits _etc._ and are therefore not always very informative.

In [6]:
# Classes regarded as uninformative when a target also has another class.
reject = {
    'Other cytosolic protein',
    'Other membrane protein',
    'Unclassified protein',
    'Enzyme > Kinase',
    'Enzyme > Kinase > Protein kinase regulatory subunit',
}

# Only the targets found above to have multiple classes are affected.
chembl_ids = df.chembl_id.tolist()

# Drop a row only when it is BOTH a rejected class AND belongs to one of the
# multi-class targets; single-class targets keep their class even if it is in
# the reject set.
target_class = target_class[
    ~(
        target_class['target_class'].isin(reject)
        & target_class['chembl_id'].isin(chembl_ids)
    )
]

In [7]:
# List the targets that still have more than one class after the filtering
# step...

(
    target_class
    .groupby(['chembl_id', 'pref_name'])
    .apply(f)
    .dropna(how='all')
    .reset_index()
)

Unnamed: 0,chembl_id,pref_name,target_classes
0,CHEMBL2095198,"Sulfonylurea receptor 2, Kir6.2","[Ion channel > Voltage-gated ion channel, Transporter > Primary active transporter]"
1,CHEMBL2096972,"Sulfonylurea receptor 1, Kir6.2","[Ion channel > Voltage-gated ion channel, Transporter > Primary active transporter]"
2,CHEMBL2221341,Mammalian target of Rapamycin (mTORC1),"[Enzyme, Enzyme > Isomerase]"
3,CHEMBL2221347,"Voltage-gated potassium channel, IKs; KCNQ1(Kv7.1)/KCNE1(MinK)","[Auxiliary transport protein > Slow voltage-gated potassium channel accessory protein family, Ion channel > Voltage-gated ion channel]"


In [8]:
# Resolve the remaining multi-class targets one by one: for each, the row
# carrying one named class is removed so that a single class is left.

# Sulfonylurea receptors: leaves 'Ion channel > Voltage-gated ion channel'
target_class = target_class.loc[
    ~(
        target_class['pref_name'].str.contains('^Sulfonylurea receptor')
        & (target_class['target_class'] == 'Transporter > Primary active transporter')
    )
]

# mTORC1: leaves 'Enzyme'
target_class = target_class.loc[
    ~(
        target_class['pref_name'].str.contains('mTORC1')
        & (target_class['target_class'] == 'Enzyme > Isomerase')
    )
]

# KCNQ1/KCNE1 (MinK): leaves 'Ion channel > Voltage-gated ion channel'
target_class = target_class.loc[
    ~(
        target_class['pref_name'].str.contains('MinK')
        & (target_class['target_class'] == 'Auxiliary transport protein > Slow voltage-gated potassium channel accessory protein family')
    )
]

In [9]:
# Number of targets that still have more than one class (the saved output of
# this cell is 0)...

len(
    target_class
    .groupby(['chembl_id', 'pref_name'])
    .apply(f)
    .dropna(how='all')
)

0

In [5]:
# Spot-check three specific targets.
# NOTE(review): this cell's execution count (In [5]) is out of sequence, and
# its saved output still shows the 'Auxiliary transport protein ...' row for
# CHEMBL2221347 that the individual-fix cell removes, so the output was
# produced before that filtering. The cell also rebinds `chembl_ids`, the name
# the reject-filter cell assigns from `df`.
chembl_ids = ('CHEMBL2221347', 'CHEMBL3038488', 'CHEMBL3038489')

target_class[target_class['chembl_id'].isin(chembl_ids)]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
117,CHEMBL2221347,"Voltage-gated potassium channel, IKs; KCNQ1(Kv7.1)/KCNE1(MinK)",KCNQ1,"potassium channel, voltage gated KQT-like subfamily Q, member 1",Auxiliary transport protein > Slow voltage-gated potassium channel accessory protein family
118,CHEMBL2221347,"Voltage-gated potassium channel, IKs; KCNQ1(Kv7.1)/KCNE1(MinK)",KCNQ1,"potassium channel, voltage gated KQT-like subfamily Q, member 1",Ion channel > Voltage-gated ion channel


### Misc fixes

* [HNF4A](http://en.wikipedia.org/wiki/Hepatocyte_nuclear_factor_4_alpha): nuclear receptor subfamily 2, group A, member 1


* [NFE2L2](http://en.wikipedia.org/wiki/NFE2L2): a transcription factor, but further classification not clear

In [10]:
# Show the current classification of the two targets to be fixed by hand.
target_class[target_class['symbol'].isin(['HNF4A', 'NFE2L2'])]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
0,CHEMBL1075094,Nuclear factor erythroid 2-related factor 2,NFE2L2,"nuclear factor, erythroid 2-like 2",Unclassified protein
244,CHEMBL5398,Hepatocyte nuclear factor 4-alpha,HNF4A,"hepatocyte nuclear factor 4, alpha",Unclassified protein


In [11]:
# Manual overrides, assigned by gene symbol (every row with that symbol is
# updated).
target_class.loc[target_class['symbol'].eq('HNF4A'), 'target_class'] = 'Transcription factor > Nuclear receptor'

target_class.loc[target_class['symbol'].eq('NFE2L2'), 'target_class'] = 'Transcription factor'

### Kinases not classified as such

* The kinase component of mTOR ([FRAP1](http://en.wikipedia.org/wiki/Phosphatidylinositol_3-kinase-related_kinase)) is an atypical protein kinase related to the PI3 kinases.


* The PIK3C* family are PI3 Kinases

In [12]:
# Rows whose symbol is exactly 'MTOR' or starts with 'PIK3C'.
target_class.loc[
    target_class['symbol'].eq('MTOR')
    | target_class['symbol'].str.contains('^PIK3C')
]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
101,CHEMBL2111367,PI3-kinase p110-alpha/p85-alpha,PIK3CA,"phosphatidylinositol-4,5-bisphosphate 3-kinase, catalytic subunit alpha",Enzyme
115,CHEMBL2221341,Mammalian target of Rapamycin (mTORC1),MTOR,mechanistic target of rapamycin (serine/threonine kinase),Enzyme
156,CHEMBL2842,FK506 binding protein 12,MTOR,mechanistic target of rapamycin (serine/threonine kinase),Enzyme
197,CHEMBL3267,PI3-kinase p110-gamma subunit,PIK3CG,"phosphatidylinositol-4,5-bisphosphate 3-kinase, catalytic subunit gamma",Enzyme
214,CHEMBL4005,PI3-kinase p110-alpha subunit,PIK3CA,"phosphatidylinositol-4,5-bisphosphate 3-kinase, catalytic subunit alpha",Enzyme


In [13]:
# Relabel every row whose symbol is 'MTOR' or starts with 'PIK3C'.
# NOTE(review): the selection is by symbol, so CHEMBL2842 ('FK506 binding
# protein 12'), which carries the symbol MTOR in the preceding output, is
# relabelled too -- confirm that this is intended.
target_class.loc[
    target_class['symbol'].eq('MTOR')
    | target_class['symbol'].str.contains('^PIK3C'),
    'target_class',
] = 'Enzyme > Kinase > PI3 Kinase'

### Phase II ADME genes

CYPs and transporters are currently classified.

In [14]:
# Gene symbols taken from the 'Gene' column of sheet 'Table 11' of the
# toxicology-associated targets workbook.
# The keyword is `sheet_name`: the older spelling `sheetname` was deprecated
# and then removed from pandas.read_excel, where it now raises a TypeError.
adme_genes = pd.read_excel('Toxicology-associated_targets.xlsx', sheet_name='Table 11')['Gene']

# Targets in that gene list whose class is still the bare 'Enzyme'.
target_class[target_class['symbol'].isin(adme_genes) & (target_class['target_class'] == 'Enzyme')]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
2,CHEMBL1287617,UDP-glucuronosyltransferase 1-1,UGT1A1,"UDP glucuronosyltransferase 1 family, polypeptide A1",Enzyme
15,CHEMBL1743291,Sulfotransferase 1A1,SULT1A1,"sulfotransferase family, cytosolic, 1A, phenol-preferring, member 1",Enzyme
76,CHEMBL2081,Glutathione S-transferase Mu 1,GSTM1,glutathione S-transferase mu 1,Enzyme
105,CHEMBL2141,Glutathione S-transferase theta 1,GSTT1,glutathione S-transferase theta 1,Enzyme
111,CHEMBL2194,Arylamine N-acetyltransferase 2,NAT2,N-acetyltransferase 2 (arylamine N-acetyltransferase),Enzyme
144,CHEMBL2500,Thiopurine S-methyltransferase,TPMT,thiopurine S-methyltransferase,Enzyme
193,CHEMBL3172,Dihydropyrimidine dehydrogenase,DPYD,dihydropyrimidine dehydrogenase,Enzyme
212,CHEMBL3902,Glutathione S-transferase Pi,GSTP1,glutathione S-transferase pi 1,Enzyme
226,CHEMBL4370,UDP-glucuronosyltransferase 2B7,UGT2B7,"UDP glucuronosyltransferase 2 family, polypeptide B7",Enzyme
235,CHEMBL4978,UDP-glucuronosyltransferase 2B17,UGT2B17,"UDP glucuronosyltransferase 2 family, polypeptide B17",Enzyme


In [15]:
# Refine the bare 'Enzyme' class to 'Enzyme > Phase II' for symbols that are
# in the ADME gene list; rows with any other class are left unchanged.
target_class.loc[
    target_class['target_class'].eq('Enzyme') & target_class['symbol'].isin(adme_genes),
    'target_class',
] = 'Enzyme > Phase II'

### Enzymes that are not ADME-related

_NB_ No fixes currently applied here.

In [16]:
# Targets still classed as bare 'Enzyme' whose symbol is not in the ADME gene
# list.
target_class.loc[
    ~target_class['symbol'].isin(adme_genes)
    & target_class['target_class'].eq('Enzyme')
]

Unnamed: 0,chembl_id,pref_name,symbol,approved_name,target_class
194,CHEMBL3189,Adenylate cyclase type V,ADCY5,adenylate cyclase 5,Enzyme
232,CHEMBL4803,"Nitric-oxide synthase, endothelial",NOS3,nitric oxide synthase 3 (endothelial cell),Enzyme


### Target class counts

In [17]:
# Number of rows per class, as a two-column frame ('target_class', 'N').
#
# The column names are set explicitly through rename_axis / reset_index(name=)
# rather than by wrapping the value_counts() Series in DataFrame(columns=['N'])
# and then writing into `columns.values`: the former depends on the name pandas
# gives the counts Series, and the latter mutates an Index's underlying array
# in place, which pandas does not support.
class_counts = (
    target_class['target_class']
    .value_counts()
    .rename_axis('target_class')
    .reset_index(name='N')
)

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
class_counts.sort_values('N', ascending=False)

Unnamed: 0,target_class,N
0,Enzyme > Kinase > Protein Kinase,57
1,Membrane receptor > Family A G protein-coupled receptor,49
2,Ion channel > Voltage-gated ion channel,19
3,Transporter > Electrochemical transporter,18
4,Transcription factor > Nuclear receptor,16
5,Enzyme > Phase II,12
6,Enzyme > Cytochrome P450,11
7,Ion channel > Ligand-gated ion channel,11
8,Membrane receptor > Family B G protein-coupled receptor,9
9,Transporter > Primary active transporter,8


### Add a simplified 'summary' classification

In [18]:
# The detailed class becomes 'target_class_1', leaving room for the simplified
# 'target_class_0' column. Rebinding the name (instead of inplace=True) leaves
# the frame produced by the earlier filtering steps unmodified.
target_class = target_class.rename(columns={'target_class': 'target_class_1'})

In [19]:
# Mapping from the detailed class (target_class_1) to the simplified summary
# class (target_class_0): GPCR families are merged, nuclear receptors are
# folded into 'Transcription factor', and the listed enzyme sub-classes are
# folded into 'Enzyme'. Kinase, Cytochrome P450 and Phase II classes map to
# themselves.
target_class_0 = {
    'Membrane receptor > Family A G protein-coupled receptor': 'Membrane receptor > G protein-coupled receptor',
    'Membrane receptor > Family B G protein-coupled receptor': 'Membrane receptor > G protein-coupled receptor',
    'Ion channel > Voltage-gated ion channel':                 'Ion channel > Voltage-gated ion channel',
    'Ion channel > Ligand-gated ion channel':                  'Ion channel > Ligand-gated ion channel',
    'Transporter > Primary active transporter':                'Transporter > Primary active transporter',
    'Transporter > Electrochemical transporter':               'Transporter > Electrochemical transporter',
    'Transcription factor > Nuclear receptor':                 'Transcription factor',
    'Transcription factor':                                    'Transcription factor',
    'Enzyme > Kinase > Protein Kinase':                        'Enzyme > Kinase > Protein Kinase',
    'Enzyme > Kinase > PI3 Kinase':                            'Enzyme > Kinase > PI3 Kinase',
    'Enzyme > Cytochrome P450':                                'Enzyme > Cytochrome P450',
    'Enzyme > Phase II':                                       'Enzyme > Phase II',
    'Enzyme':                                                  'Enzyme',
    'Enzyme > Oxidoreductase':                                 'Enzyme',
    'Enzyme > Protease':                                       'Enzyme',
    'Enzyme > Phosphodiesterase':                              'Enzyme',
    'Enzyme > Lyase':                                          'Enzyme',
    'Enzyme > Hydrolase':                                      'Enzyme',
}

# Direct dict lookup: a detailed class that is missing from the mapping raises
# KeyError here instead of silently producing a null summary class.
target_class['target_class_0'] = target_class['target_class_1'].apply(lambda x: target_class_0[x])

# Put the summary class before the detailed one. The columns are selected by
# name, not by position: swapping "the last two columns" gives the same result
# on the first run but flips the order back every time the cell is re-run.
class_cols = ['target_class_0', 'target_class_1']

target_class = target_class[[c for c in target_class.columns if c not in class_cols] + class_cols]

In [20]:
# (rows, columns) of the classification table at this point.
target_class.shape

(229, 6)

### Simplified class counts

In [21]:
# Number of rows per simplified class, as a two-column frame
# ('target_class_0', 'N').
#
# The column names are set explicitly through rename_axis / reset_index(name=)
# rather than by wrapping the value_counts() Series in DataFrame(columns=['N'])
# and then writing into `columns.values`: the former depends on the name pandas
# gives the counts Series, and the latter mutates an Index's underlying array
# in place, which pandas does not support.
class_counts_0 = (
    target_class['target_class_0']
    .value_counts()
    .rename_axis('target_class_0')
    .reset_index(name='N')
)

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
class_counts_0.sort_values('N', ascending=False)

Unnamed: 0,target_class_0,N
0,Membrane receptor > G protein-coupled receptor,58
1,Enzyme > Kinase > Protein Kinase,57
2,Ion channel > Voltage-gated ion channel,19
3,Transporter > Electrochemical transporter,18
4,Transcription factor,18
5,Enzyme,12
6,Enzyme > Phase II,12
7,Enzyme > Cytochrome P450,11
8,Ion channel > Ligand-gated ion channel,11
9,Transporter > Primary active transporter,8


### Save classification

In [22]:
# Sanity check: after reducing to distinct (symbol, target_class_0) pairs, no
# symbol may appear twice, i.e. each symbol has at most one summary class...

assert not (
    target_class[['symbol', 'target_class_0']]
    .drop_duplicates()['symbol']
    .duplicated()
    .any()
)

In [23]:
# Persist the full classification table (all columns) to a pickle file in the
# working directory.
target_class.to_pickle('target_class.pkl')

In [11]:
# Convenience lookup: a frame indexed by symbol with a single column,
# 'target_class', holding the simplified (target_class_0) class...

target_class_map = (
    target_class[['symbol', 'target_class_0']]
    .drop_duplicates()
    .set_index('symbol')
    .rename(columns={'target_class_0': 'target_class'})
)

target_class_map.to_pickle('target_class_map.pkl')

In [24]:
# # Write to RDBMS...

# col_types = {
#     'chembl_id':      VARCHAR2(20),
#     'pref_name':      VARCHAR2(200),
#     'symbol':         VARCHAR2(10),
#     'approved_name':  VARCHAR2(1000),
#     'target_class_0': VARCHAR2(1000),
#     'target_class_1': VARCHAR2(1000)
# }

# target_class.to_sql('tt_target_class', engine, if_exists='replace', index=False, dtype=col_types)

## Merge into target data

In [25]:
# Load the target table that is to be annotated with the classes.
# NOTE(review): the final cell of this notebook writes its result back to this
# same path, so a second run starts from the already-annotated file.
targets = pd.read_pickle('chembl_targets.pkl') 

# (rows, columns) before the merge.
targets.shape

(377, 9)

In [26]:
cols = ['chembl_id', 'target_class_0', 'target_class_1']

# This notebook saves its result over 'chembl_targets.pkl', the file `targets`
# is loaded from, so on a re-run `targets` may already contain the class
# columns. Drop them first; otherwise the merge would produce suffixed
# duplicates ('target_class_0_x', 'target_class_0_y', ...).
targets = targets[[c for c in targets.columns if c not in cols[1:]]]

n_targets = targets.shape[0]

# Left join: every target row is kept. Targets with no classification get
# 'N/A' (note that fillna is applied to the whole frame, not just to the two
# class columns).
targets = targets.merge(target_class[cols], on='chembl_id', how='left').fillna('N/A')

# A left join only adds rows when a chembl_id occurs more than once in
# `target_class`; fail loudly instead of silently duplicating targets.
assert targets.shape[0] == n_targets, 'merge changed the number of target rows: duplicate chembl_id in target_class'

targets.shape

(377, 11)

In [27]:
# Preview the first rows of the merged target table, rendered explicitly as an
# HTML table.
HTML(targets.head().to_html())

Unnamed: 0,symbol,approved_name,targets,n_target,chembl_id,target_type,pref_name,species,exclude,target_class_0,target_class_1
0,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","MDR1;7|MDR1;8|ATP-binding cassette, sub-family B (MDR/TAP), member 1;11",1,CHEMBL4302,SINGLE PROTEIN,P-glycoprotein 1,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter
1,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","MDR1;7|MDR1;8|ATP-binding cassette, sub-family B (MDR/TAP), member 1;11",1,CHEMBL1075229,SINGLE PROTEIN,Multidrug resistance protein 1,Rat,0,,
2,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11",BSEP;6|BSEP;7|BSEP;8,1,CHEMBL6020,SINGLE PROTEIN,Bile salt export pump,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter
3,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11",BSEP;6|BSEP;7|BSEP;8,1,CHEMBL2073674,SINGLE PROTEIN,Bile salt export pump,Rat,0,,
4,ABCB4,"ATP-binding cassette, sub-family B (MDR/TAP), member 4",MDR3;7|MDR3;8,1,CHEMBL1743129,SINGLE PROTEIN,Multidrug resistance protein 3,Human,0,Transporter > Primary active transporter,Transporter > Primary active transporter


### Save/Restore

File now includes target classes.

In [28]:
# NOTE(review): the backup call below is commented out, so the to_pickle call
# overwrites 'chembl_targets.pkl' -- the same file this notebook reads as its
# input -- without this cell keeping a copy of the previous version.
# backup_file('chembl_targets.pkl')

targets.to_pickle('chembl_targets.pkl')