# Load Targets

* Load targets from Excel Workbook
* Get validated HGNC symbols and corresponding approved names
* Get list of unique Symbols

In [1]:
from itertools import combinations, chain

### Configuration

In [2]:
# ChEMBL connection...

# The connection string is read from 'database.txt' rather than being written
# into the notebook. The file is opened in a 'with' block so that the handle is
# closed as soon as it has been read instead of being left to the garbage collector.
with open('database.txt') as connection_file:
    engine = create_engine(connection_file.read().strip())

In [3]:
# When true, the gene-to-symbol conversion loop logs one INFO line per input record.
verbose = True

### Load targets from Excel Workbook

In [6]:
# Excel workbook containing the literature targets; every sheet in it is loaded.
input_xlsx = 'Toxicology-associated_targets.xlsx'

In [7]:
# Read the first two columns of every sheet in the workbook into one frame,
# tagging each row with the table it came from.

xl = pd.ExcelFile(input_xlsx)

sheet_frames = []

for sheetname in xl.sheet_names:

    # Parse from the workbook that is already open instead of calling
    # pd.read_excel() again for each sheet. The sheet is passed positionally
    # because the keyword was renamed from 'sheetname' to 'sheet_name' in later
    # pandas releases. The copy() makes the slice safe to modify.
    df = xl.parse(sheetname).iloc[:, 0:2].copy()

    df.columns = ['target', 'genes']

    # Sheet names look like 'Table <id>'; keep only the identifier.
    df['table'] = re.sub(r'Table\s+', '', sheetname)

    sheet_frames.append(df)

df0 = pd.concat(sheet_frames).reset_index(drop=True)

df0.shape

(294, 3)

In [8]:
df0

Unnamed: 0,target,genes,table
0,Adenosine receptor A2A,ADORA2A,1.0
1,α1A-adrenergic receptor,ADRA1A,1.0
2,α2A-adrenergic receptor,ADRA2A,1.0
3,β1-adrenergic receptor,ADRB1,1.0
4,β2-adrenergic receptor,ADRB2,1.0
5,Dopamine receptor D1,DRD1,1.0
6,Dopamine receptor D2,DRD2,1.0
7,Endothelin receptor A,EDNRA,1.0
8,Histamine H1 receptor,HRH1,1.0
9,Histamine H2 receptor,HRH2,1.0


In [9]:
# Check there are no non-ASCII characters in the gene names...

for gene in df0['genes']:

    try:

        # Encoding is the direct test for text input. The previous
        # gene.decode('ascii') only worked through Python 2's implicit
        # encode-then-decode and does not exist on Python 3 strings.
        gene.encode('ascii')

    except UnicodeError:  # parent of both UnicodeEncodeError and UnicodeDecodeError

        logging.info(u"Input '{}' contains non-ASCII characters".format(gene))

### Expand gene names into HGNC symbols

Expand gene names into HGNC symbols, check the validity of these and get the corresponding approved names.

In [10]:
# Function to expand gene name(s) from original sources into HGNC symbols (NB validation of these symbols is a separate step)...

def get_symbols(gene):
    """Expand a free-text gene specification into a list of candidate symbols.

    The text is split on ',' and '&'. Each piece has its whitespace removed and
    is then expanded if it uses one of two shorthand forms:

    * a single-digit numeric range: 'ATP1A1-4' -> ATP1A1, ATP1A2, ATP1A3, ATP1A4
    * a '/'-separated list of alternative suffixes: 'GSK3A/B' -> GSK3A, GSK3B
      (the length of the first alternative decides how much of the leading
      name is treated as the replaceable suffix)

    Any other piece is returned unchanged. Empty pieces (for example from a
    trailing separator) are dropped. The symbols are not validated here.

    NOTE(review): a genuine symbol that happens to look like a range (a digit,
    a hyphen, then a final digit) is expanded too; confirm that no such symbol
    occurs in the input.

    Parameters
    ----------
    gene : str
        Gene name(s) as given in the original source.

    Returns
    -------
    list of str
        Candidate symbols, in order of appearance.
    """
    symbols = []

    for chunk in re.split(r'\s*[,&]\s*', gene):

        # Remove every kind of whitespace, not just the plain space character.
        chunk = ''.join(chunk.split())

        if not chunk:
            # Nothing between two separators, or an empty specification.
            continue

        is_range = re.search(r'^(\w*)(\d)-(\d)$', chunk)

        if is_range:

            stem, i, j = is_range.groups()

            # An inverted range such as '3-1' is empty, so the chunk falls
            # through and is kept verbatim.
            suffixes = list(range(int(i), int(j) + 1))

        else:

            suffix_list = re.search(r'^(\w+)/(.+)$', chunk)

            if suffix_list:

                first = suffix_list.group(1)

                # Ignore empty alternatives ('A1//2'); an empty first
                # alternative would otherwise give n == 0 and first[:-0]
                # would discard the whole stem.
                suffixes = [s for s in suffix_list.group(2).split('/') if s]

                if suffixes:

                    n = len(suffixes[0])  # NB Assumption is that all suffixes are the same length

                    stem, suffix = first[:-n], first[-n:]  # Divide first name

                    suffixes.insert(0, suffix)

                else:

                    # Only separators followed the name: keep the name itself.
                    chunk = first

            else:

                suffixes = None

        if suffixes:

            symbols.extend("{}{}".format(stem, x) for x in suffixes)

        else:  # chunk is a symbol

            symbols.append(chunk)

    return symbols

In [11]:
# Convert gene name(s) from original sources into valid HGNC symbols, using the HGNC web service to validate symbols and get their approved names...

cache = {} # Cache of validated HGNC symbols and their approved names

unrecognised = set() # Symbols HGNC has already rejected; kept apart so that 'cache' only ever holds valid symbols

hgnc_timeout = 30 # Seconds to wait for the web service before giving up, so a stalled connection cannot hang the notebook

records = []

for n, record in enumerate((dict(x) for _, x in df0.iterrows()), 1):

    gene, target = record['genes'], record['target']

    symbols = get_symbols(gene)

    if verbose: logging.info(u"> {:3d}) target '{}', genes '{}' -> symbols {}".format(n, target, gene, ', '.join("'{}'".format(x) for x in symbols)))

    for n_symbol, symbol in enumerate(symbols, 1):

        # Query the service at most once per distinct symbol, whether it turns out to be valid or not.
        if symbol not in cache and symbol not in unrecognised:

            response = requests.get("http://rest.genenames.org/fetch/symbol/{}".format(symbol), headers={'accept': 'application/json'}, timeout=hgnc_timeout)

            # Fail on an HTTP error here rather than with an obscure JSON or key error below.
            response.raise_for_status()

            results = response.json()['response']['docs']

            if results:

                cache[symbol] = results[0]['name']

            else:

                unrecognised.add(symbol)

        if symbol not in cache:

            # Warn for every input record that mentions the symbol, as each has a different target/table context.
            # The format string is unicode so that non-ASCII target names cannot raise on Python 2.
            logging.warning(u"Symbol '{}' from gene(s) '{}' for target '{}' in table {} is not recognized by HGNC".format(symbol, gene, target, record['table']))

            continue

        tmp = record.copy()

        tmp.update({'n_symbol': n_symbol, 'symbol': symbol, 'approved_name': cache[symbol]})

        records.append(tmp)

df1 = pd.DataFrame(records, columns=df0.columns.values.tolist() + ['n_symbol', 'symbol', 'approved_name'])

# Use len(df0), not the loop variable 'n', which is undefined when df0 has no rows.
logging.info("Finished: did {} input records in total.".format(len(df0)))

[2015/Jul/07 13:14:45 INFO    ] >   1) target 'Adenosine receptor A2A', genes 'ADORA2A' -> symbols 'ADORA2A'
[2015/Jul/07 13:14:46 INFO    ] >   2) target 'α1A-adrenergic receptor', genes 'ADRA1A' -> symbols 'ADRA1A'
[2015/Jul/07 13:14:46 INFO    ] >   3) target 'α2A-adrenergic receptor', genes 'ADRA2A' -> symbols 'ADRA2A'
[2015/Jul/07 13:14:47 INFO    ] >   4) target 'β1-adrenergic receptor', genes 'ADRB1' -> symbols 'ADRB1'
[2015/Jul/07 13:14:47 INFO    ] >   5) target 'β2-adrenergic receptor', genes 'ADRB2' -> symbols 'ADRB2'
[2015/Jul/07 13:14:48 INFO    ] >   6) target 'Dopamine receptor D1', genes 'DRD1' -> symbols 'DRD1'
[2015/Jul/07 13:14:48 INFO    ] >   7) target 'Dopamine receptor D2', genes 'DRD2' -> symbols 'DRD2'
[2015/Jul/07 13:14:48 INFO    ] >   8) target 'Endothelin receptor A', genes 'EDNRA' -> symbols 'EDNRA'
[2015/Jul/07 13:14:49 INFO    ] >   9) target 'Histamine H1 receptor', genes 'HRH1' -> symbols 'HRH1'
[2015/Jul/07 13:14:49 INFO    ] >  10) target 'Histamine 

In [12]:
df1.shape

(344, 6)

In [13]:
df1.head()

Unnamed: 0,target,genes,table,n_symbol,symbol,approved_name
0,Adenosine receptor A2A,ADORA2A,1,1,ADORA2A,adenosine A2a receptor
1,α1A-adrenergic receptor,ADRA1A,1,1,ADRA1A,adrenoceptor alpha 1A
2,α2A-adrenergic receptor,ADRA2A,1,1,ADRA2A,adrenoceptor alpha 2A
3,β1-adrenergic receptor,ADRB1,1,1,ADRB1,adrenoceptor beta 1
4,β2-adrenergic receptor,ADRB2,1,1,ADRB2,"adrenoceptor beta 2, surface"


In [None]:
# Save/restore...

df1.to_pickle('raw_target_data.pkl')

In [14]:
# df1 = pd.read_pickle('raw_target_data.pkl')

### Get list of unique Symbols

In [15]:
def f(group):
    """Collect the (target, table) pairs of one group into a single value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one groupby group; must have 'target' and 'table' columns.

    Returns
    -------
    pandas.Series
        One entry, 'targets', holding a tuple of (target, table) tuples in
        row order.
    """
    pairs = group[['target', 'table']].values

    return pd.Series({'targets': tuple(map(tuple, pairs))})

# One row per (symbol, approved_name) pair, carrying the tuple of (target, table) pairs built by f.
# The result is indexed by symbol; drop=False keeps 'symbol' as an ordinary column as well.
symbols = df1.groupby(['symbol', 'approved_name']).apply(f).reset_index().set_index('symbol', drop=False)

symbols.shape

(215, 3)

In [16]:
symbols

Unnamed: 0_level_0,symbol,approved_name,targets
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABCB1,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), member 1","((MDR1, 7), (MDR1, 8), (ATP-binding cassette, sub-family B (MDR/TAP), member 1, 11))"
ABCB11,ABCB11,"ATP-binding cassette, sub-family B (MDR/TAP), member 11","((BSEP, 6), (BSEP, 7), (BSEP, 8))"
ABCB4,ABCB4,"ATP-binding cassette, sub-family B (MDR/TAP), member 4","((MDR3, 7), (MDR3, 8))"
ABCC2,ABCC2,"ATP-binding cassette, sub-family C (CFTR/MRP), member 2","((MRP2, 7), (MRP2, 8), (ATP-binding cassette, sub-family C (CFTR/MRP), member 2, 11))"
ABCC3,ABCC3,"ATP-binding cassette, sub-family C (CFTR/MRP), member 3","((MRP3, 7), (MRP3, 8))"
ABCC4,ABCC4,"ATP-binding cassette, sub-family C (CFTR/MRP), member 4","((MRP4, 7), (MRP4, 8))"
ABCC6,ABCC6,"ATP-binding cassette, sub-family C (CFTR/MRP), member 6","((MRP6, 8),)"
ABCG2,ABCG2,"ATP-binding cassette, sub-family G (WHITE), member 2 (Junior blood group)","((BCRP, 7), (BCRP, 8), (ATP-binding cassette, sub-family G (WHITE), member 2, 11))"
ABCG5,ABCG5,"ATP-binding cassette, sub-family G (WHITE), member 5","((ABCG5, 7),)"
ABCG8,ABCG8,"ATP-binding cassette, sub-family G (WHITE), member 8","((ABCG8, 7),)"


In [17]:
# Save/restore...

symbols.to_pickle('unique_symbols.pkl')

In [None]:
# symbols = pd.read_pickle('unique_symbols.pkl')

In [18]:
# Write table to RDBMS...

# Work on a copy so that the 'targets' tuples in 'symbols' are left intact.
tmp = symbols.copy()

# Convert 'targets' to string representation: 'target;table' items joined by '|',
# with non-ASCII characters written as XML character references.
tmp['targets'] = list(
    u'|'.join(u'{};{}'.format(name, table) for name, table in pairs).encode('ascii', 'xmlcharrefreplace')
    for pairs in tmp['targets']
)

# Write to database...

# Explicit column types for the table that is created.
col_types = dict(
    symbol=VARCHAR2(10),
    approved_name=VARCHAR2(1000),
    targets=VARCHAR2(1000),
)

# if_exists='replace': any existing 'tt_symbols' table is replaced.
tmp.to_sql('tt_symbols', engine, if_exists='replace', index=False, dtype=col_types)

del tmp

### Check overlap of targets

Here, 'targets' is used to mean the targets from the literature, however they may have been specified (and _not_ ChEMBL targets).

Thus, a target specified as 'AKT1, 2 or 3' clearly overlaps with 'PKBa (AKT1)'.

In [19]:
def f(x):
    """Summarise one group of rows as its symbols and how many there are.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one groupby group; must have a 'symbol' column.

    Returns
    -------
    pandas.Series
        'symbols' (the values of the 'symbol' column) and 'count' (their number).
    """
    members = x['symbol'].values

    summary = {'symbols': members, 'count': len(members)}

    return pd.Series(summary)

# One row per (target, table) pair, with the symbols mapped to it and their count (as returned by f).
df2 = df1.groupby(['target', 'table']).apply(f).reset_index()

In [20]:
# Get targets where there is a 'non-trivial' overlap of symbols...

def f(x, y):
    """Render a pair of targets as HTML if their symbol sets overlap non-trivially.

    Parameters
    ----------
    x, y : mapping-like
        Two records, each with a 'symbols' entry that can be turned into a set.

    Returns
    -------
    str
        An HTML table of the two records, or '' when the symbol sets are
        disjoint, identical, or both contain a single symbol.
    """
    left, right = set(x['symbols']), set(y['symbols'])

    # No symbol in common: not an overlap at all.
    if not (left & right):
        return ''

    # Same symbols on both sides: a trivial overlap.
    if left == right:
        return ''

    # Two single-symbol targets.
    if len(left) == 1 and len(right) == 1:
        return ''

    return pd.DataFrame([x, y]).to_html()

# Compare every unordered pair of df2 rows; f returns '' for pairs that are not of interest.
html = ''.join(
    f(left, right)
    for (_, left), (_, right) in combinations(df2.iterrows(), 2)
)

HTML(html)

Unnamed: 0,target,table,count,symbols
20,"AKT1, 2 or 3",5.0,3,"[AKT1, AKT2, AKT3]"
213,PKBa (AKT1),2.2,1,[AKT1]

Unnamed: 0,target,table,count,symbols
35,Acetylcholine receptor subunit α1 or α4,1.0,2,"[CHRNA1, CHRNA4]"
189,Nicotinic acetylcholine,2.1,1,[CHRNA1]

Unnamed: 0,target,table,count,symbols
35,Acetylcholine receptor subunit α1 or α4,1.0,2,"[CHRNA1, CHRNA4]"
190,Nicotinic receptor (central),2.2,1,[CHRNA1]

Unnamed: 0,target,table,count,symbols
43,Adenosine Transporter,2.1,1,[SLC29A1]
91,ENT1/2,8.0,2,"[SLC29A1, SLC29A2]"

Unnamed: 0,target,table,count,symbols
45,"Adenosine transporter (AdT, ENT1)",2.2,1,[SLC29A1]
91,ENT1/2,8.0,2,"[SLC29A1, SLC29A2]"

Unnamed: 0,target,table,count,symbols
72,CDK2A,2.2,1,[CDK2]
73,CDKs,5.0,2,"[CDK2, CDK4]"

Unnamed: 0,target,table,count,symbols
93,ERK1/2,5,2,"[MAPK3, MAPK1]"
94,ERK2 (P42mapk),10,1,[MAPK1]

Unnamed: 0,target,table,count,symbols
110,GSK3a,10,1,[GSK3A]
112,GSK3α/β,5,2,"[GSK3A, GSK3B]"

Unnamed: 0,target,table,count,symbols
111,GSK3a,3,1,[GSK3A]
112,GSK3α/β,5,2,"[GSK3A, GSK3B]"

Unnamed: 0,target,table,count,symbols
130,IKs,4,1,[KCNQ1]
222,Potassium voltage-gated channel KQT-like member 1 and minimal potassium channel MinK,1,2,"[KCNQ1, KCNE1]"

Unnamed: 0,target,table,count,symbols
140,KCNQ1,2.2,1,[KCNQ1]
222,Potassium voltage-gated channel KQT-like member 1 and minimal potassium channel MinK,1.0,2,"[KCNQ1, KCNE1]"

Unnamed: 0,target,table,count,symbols
193,OAT2,8,1,[SLC22A7]
200,OATs,7,2,"[SLC22A7, SLC22A9]"

Unnamed: 0,target,table,count,symbols
194,OAT7,8,1,[SLC22A9]
200,OATs,7,2,"[SLC22A7, SLC22A9]"

Unnamed: 0,target,table,count,symbols
195,OATP1A2,8,1,[SLCO1A2]
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"

Unnamed: 0,target,table,count,symbols
196,OATP1B1,8,1,[SLCO1B1]
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"

Unnamed: 0,target,table,count,symbols
197,OATP1B3,8,1,[SLCO1B3]
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"

Unnamed: 0,target,table,count,symbols
198,OATP2B1,8,1,[SLCO2B1]
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"

Unnamed: 0,target,table,count,symbols
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"
282,"solute carrier organic anion transporter family, member 1B1",11,1,[SLCO1B1]

Unnamed: 0,target,table,count,symbols
199,OATPs,7,4,"[SLCO1A2, SLCO1B1, SLCO1B3, SLCO2B1]"
283,"solute carrier organic anion transporter family, member 1B3",11,1,[SLCO1B3]

Unnamed: 0,target,table,count,symbols
207,PDE3A,3.0,1,[PDE3A]
220,Phosphodiesterase 3 (PDE3),2.2,2,"[PDE3A, PDE3B]"

Unnamed: 0,target,table,count,symbols
220,Phosphodiesterase 3 (PDE3),2.2,2,"[PDE3A, PDE3B]"
221,Phosphodiesterase 3A,1.0,1,[PDE3A]


### Targets corresponding to multiple symbols

Again, here 'targets' means the targets mentioned in the literature and not, _e.g._ ChEMBL targets.

In [21]:
def f(x):
    """Summarise one group of rows as a one-row frame of its symbols and their count.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one groupby group; must have a 'symbol' column.

    Returns
    -------
    pandas.DataFrame
        A single row with 'symbols' (the values of the 'symbol' column) and
        'count' (their number).
    """
    members = x['symbol'].values

    # A DataFrame is required in this case: because of the complex groupby key, maybe?
    row = {'symbols': [members], 'count': [len(members)]}

    return pd.DataFrame(row)

# Group the unique symbols by their literature target(s); the inner index level that apply() adds is dropped.
symbols_by_target = symbols.groupby('targets').apply(f).reset_index(1, drop=True).reset_index()

# Show targets that map to more than one symbol, largest families first.
# DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
symbols_by_target.query("count > 1").sort_values('count', ascending=False)

Unnamed: 0,targets,count,symbols
19,"((ATPase (Na+/K+), 3), (ATPase (Na+/K+), 10))",8,"[ATP1A1, ATP1A2, ATP1A3, ATP1A4, ATP1B1, ATP1B2, ATP1B3, ATP1B4]"
74,"((IKP, 4),)",4,"[KCNK1, KCNK3, KCNK4, KCNK6]"
32,"((Aurora kinases, 5),)",3,"[AURKA, AURKB, AURKC]"
78,"((Ito,s, 4),)",3,"[KCNA4, KCNA7, KCNC4]"
145,"((VEGFRs, 5),)",3,"[FLT1, FLT4, KDR]"
62,"((GRK2, GRK5, 5),)",2,"[ADRBK1, GRK5]"
71,"((ICa,T, 4),)",2,"[CACNA1G, CACNA1H]"
72,"((IK1, 4),)",2,"[KCNJ12, KCNJ2]"
73,"((IKAch, 4),)",2,"[KCNJ3, KCNJ5]"
13,"((AMPK, 5),)",2,"[PRKAA1, PRKAA2]"


Most of these literature targets corresponding to multiple gene symbols are simply families of isoforms, _e.g._ 'Aurora kinases' = AURKA, AURKB and AURKC, or 'VEGFRs' = FLT1, FLT4 and KDR. 

However, in the case of the Ion Channels (from Table 4), this is not always the case. Here, products of more than one gene may contribute to a given current: for example, both [KCNA4](https://en.wikipedia.org/wiki/KCNA4) and [KCNA7](https://en.wikipedia.org/wiki/KCNA7) contribute to I<sub>to,s</sub>.

Whether, in all cases, one isoform can be identified as the 'primary' member of the family and therefore, for simplicity, identified with the current itself isn't clear.

This is something of an issue as the assumption is made elsewhere that there is always a gene-to-target mapping, _i.e._ that each target may be identified with a single gene.

One reason for doing this is to have an unambiguous nomenclature for the targets, as gene names are standardised (by HUGO). The names of multi-component entities, by contrast, seem to be less standardised.

The simple one-to-one mapping obviously holds in many simple cases, such as most enzymes (note that it is not an issue if various members of a family are included).
 
In cases of complexes it is obviously a simplification; however, the hope is that, in most cases, there will be one central, active unit (_i.e._ one that ligands would interact with) that may stand for the complex as a whole.

Whether cases, like the ion channels, where several members of a family actually contribute to the activity may be treated this way remains to be seen.