In [49]:
import collections
import gzip
import time
import math
import itertools

import pandas
import numpy
import sqlite3

import unichem

In [2]:
# Open the L1000 SQLite database through a URI filename and keep a cursor
# for the statements that later cells execute directly.
connection = sqlite3.connect('file:data/l1000.db', uri=True)
cursor = connection.cursor()

In [3]:
# Load every row of `perts` whose pert_type is 'trt_cp' and whose InChI string
# is not NULL, ordered by InChIKey, into a DataFrame.
query = """
SELECT * FROM perts
WHERE pert_type == 'trt_cp'
AND inchi_string NOTNULL
ORDER BY inchi_key;
"""
pert_df = pandas.read_sql(query, connection)

In [4]:
# Preview the first rows of the selected perturbagens.
pert_df.head()

Unnamed: 0,pert_uid,pert_id,pert_iname,pert_type,num_gold,num_inst,num_sig,in_summly,inchi_string,inchi_key,pubchem_cid
0,39929,BRD-K13087974,"4,5-dianilinophthalimide",trt_cp,13,65,18,1,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...,AAALVYBICLMAMA-UHFFFAOYSA-N,1697
1,9777,BRD-K03568209,MW-STK33-4C,trt_cp,14,120,32,1,InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-...,AACFPJSJOWQNBN-UHFFFAOYSA-N,755673
2,2195,BRD-K50417881,eticlopride,trt_cp,4,64,19,0,InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)...,AADCDMQTJNYOSS-LBPRGKRZSA-N,6917728
3,47318,BRD-K70633610,BRD-K70633610,trt_cp,0,8,3,0,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,AADVJQLQUVDEBP-GQIGUUNPSA-N,54631316
4,50497,BRD-K95549765,BRD-K95549765,trt_cp,2,8,3,0,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,AADVJQLQUVDEBP-GUXCAODWSA-N,54631309


In [5]:
# Tabulate the unichem module's source-id -> resource-name lookup.
# src_id is stored as text so it can serve as a join key in later merges.
src_df = (
    pandas.DataFrame(
        list(unichem.id_to_source.items()),
        columns=['src_id', 'resource'],
    )
    .astype({'src_id': str})
)
src_df.head()

Unnamed: 0,src_id,resource
0,0,
1,1,chembl
2,2,drugbank
3,3,pdb
4,4,iuphar


In [26]:
# Uncomment to drop the existing unichem table
# (presumably to rebuild the mappings from scratch -- confirm before running).
#cursor.execute('DROP TABLE unichem')

In [27]:
# Create the table that holds the UniChem mappings; the statement is a no-op
# when the table already exists.
command = '''
CREATE TABLE IF NOT EXISTS unichem
(
    pert_uid INTEGER NOT NULL,
    query_inchi_key TEXT NOT NULL,
    resource TEXT NOT NULL,
    resource_id TEXT NOT NULL,
    C INTEGER,
    b INTEGER,
    i INTEGER,
    m INTEGER,
    p INTEGER,
    s INTEGER,
    t INTEGER,
    FOREIGN KEY(pert_uid) REFERENCES perts(pert_uid)
);
'''
cursor.execute(command)

# Column names in declaration order: the second field of every
# PRAGMA table_info row is the column name.
table_info_rows = cursor.execute('PRAGMA table_info(unichem);').fetchall()
unichem_columns = [name for _cid, name, *_rest in table_info_rows]
unichem_columns

['pert_uid',
 'query_inchi_key',
 'resource',
 'resource_id',
 'C',
 'b',
 'i',
 'm',
 'p',
 's',
 't']

In [29]:
# Map every selected L1000 perturbagen to external resources through UniChem
# and append the hits to the `unichem` table, committing one compound at a time.
#
# Compounds whose pert_uid is already present in the table are skipped, so the
# cell can be re-run (or resumed after an interruption) without inserting
# duplicate rows or re-querying the service for compounds that are done.
already_mapped = {
    row[0] for row in connection.execute('SELECT DISTINCT pert_uid FROM unichem')
}

for pert in pert_df[['pert_uid', 'inchi_key']].itertuples(index=False):
    if pert.pert_uid in already_mapped:
        continue
    # Progress indicator: the carriage return keeps it on a single output line.
    print(pert.inchi_key, '\r', end='')
    # First try the full InChIKey.
    matches = list(unichem.key_search(pert.inchi_key, C=4))
    if not matches:
        # Fall back to the first hyphen-separated block of the key (the
        # "connection layer" that UniChem's error message suggests re-querying
        # with). Only the first block is needed, so keys with an unexpected
        # number of blocks no longer raise on unpacking.
        connection_layer = pert.inchi_key.split('-')[0]
        matches = list(unichem.key_search(connection_layer, C=4))
    if not matches:
        # NOTE(review): misses skip the pause at the bottom of the loop, so
        # consecutive unmatched compounds query the service back to back --
        # confirm that this is acceptable for the service.
        continue
    map_df = (
        pandas.DataFrame(matches)
        .assign(pert_uid=pert.pert_uid)
        # Joins on the columns shared with src_df to attach the resource name.
        .merge(src_df)
        .astype({'src_compound_id': str})
        .rename(columns={'src_compound_id': 'resource_id', 'Query_InChIKey': 'query_inchi_key'})
    )
    # Keep only the table's columns, in the table's column order.
    map_df = map_df[unichem_columns]
    map_df.to_sql('unichem', connection, if_exists='append', index=False)
    connection.commit()
    # Pause between compounds to avoid hammering the web service.
    time.sleep(2)

UniChem error: No standard InChI exists in UniChem with an InChIKey of 'ABOPFQCYXIJSPV-DRJFBDSASA-N',  BUT others exist with an InChIKey connection layer of 'ABOPFQCYXIJSPV' (such as 'ABOPFQCYXIJSPV-VOBJICJNSA-N'). Try re-querying with the connection layer alone. 
UniChem error: No standard InChI exists in UniChem with an InChIKey of 'AHMXYULEXZAPCS-VWLOTQADSA-N',  BUT others exist with an InChIKey connection layer of 'AHMXYULEXZAPCS' (such as 'AHMXYULEXZAPCS-UHFFFAOYSA-N'). Try re-querying with the connection layer alone. 
UniChem error: No standard InChI exists in UniChem with an InChIKey of 'AJBUMCZTMHNJAQ-JKHIJQBDSA-N',  BUT others exist with an InChIKey connection layer of 'AJBUMCZTMHNJAQ' (such as 'AJBUMCZTMHNJAQ-UHFFFAOYSA-N'). Try re-querying with the connection layer alone. 
UniChem error: No standard InChI exists in UniChem with an InChIKey of 'AJBUMCZTMHNJAQ-NUNAXRQHSA-N',  BUT others exist with an InChIKey connection layer of 'AJBUMCZTMHNJAQ' (such as 'AJBUMCZTMHNJAQ-UHFFFA

In [34]:
# show head of unichem table to verify integrity
# (first five rows only, read back from the database rather than from memory)
pandas.read_sql("SELECT * FROM unichem LIMIT 5", connection)

Unnamed: 0,pert_uid,query_inchi_key,resource,resource_id,C,b,i,m,p,s,t
0,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,ibm,AE722659292C1007454FEEFE96518DF6,0,0,0,0,0,0,0
1,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,pubchem_tpharma,14826595,0,0,0,0,0,0,0
2,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,chebi,53110,0,0,0,0,0,0,0
3,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,actor,145915-58-8,0,0,0,0,0,0,0
4,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,pubchem,1697,0,0,0,0,0,0,0


In [35]:
# Close the connection used to build the unichem table.
connection.close()

# Example usage

In [3]:
# Reopen the database read-only (`mode=ro` in the URI) for querying the stored mappings.
connection = sqlite3.connect('file:data/l1000.db?mode=ro', uri=True)

In [65]:
def get_unichem_mapping(connection, resource, resource_ids, chunk_size=900):
    """
    Return the L1000 compounds that UniChem maps to the given identifiers.

    Parameters
    ----------
    connection
        Open connection to the database holding the `unichem` and `perts` tables.
    resource : str
        Name of the external resource, such as 'drugbank' or 'pubchem'.
    resource_ids : iterable
        Compound identifiers of that resource. Each one is converted with
        `str()` before querying. Duplicates are allowed and the iterable may
        be empty.
    chunk_size : int, default 900
        Maximum number of identifiers bound to a single query. The default
        presumably keeps each query (identifiers plus the resource name) under
        the 999-parameter limit of older SQLite builds.

    Returns
    -------
    pandas.DataFrame
        One row per distinct mapping with the columns pert_id, resource,
        resource_id, C, b, i, m, p, s, t, l1000_inchi_key and query_inchi_key.
        Empty (but with those columns) when nothing matches or when
        `resource_ids` is empty.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    query = '''
    SELECT perts.pert_id, resource, resource_id, C, b, i, m, p, s, t, perts.inchi_key as l1000_inchi_key, query_inchi_key
    FROM unichem, perts
    WHERE unichem.pert_uid = perts.pert_uid
    AND unichem.resource = ?
    AND unichem.resource_id IN ({});
    '''
    resource_ids = [str(x) for x in resource_ids]
    chunk_starts = list(range(0, len(resource_ids), chunk_size))
    if not chunk_starts:
        # No identifiers at all: still run one query, with an empty IN () list
        # (accepted by SQLite), so the caller gets an empty frame with the
        # usual columns instead of pandas.concat failing on an empty list.
        chunk_starts = [0]
    dfs = list()
    for start in chunk_starts:
        resource_ids_part = resource_ids[start:start + chunk_size]
        params = [resource] + resource_ids_part
        # One '?' placeholder per identifier in this chunk.
        subquery = query.format(','.join(['?'] * len(resource_ids_part)))
        dfs.append(pandas.read_sql(subquery, connection, params=params))
    # Repeated identifiers can return the same row from several chunks.
    unichem_df = pandas.concat(dfs).drop_duplicates()
    return unichem_df

In [66]:
# Look up five DrugBank identifiers; only identifiers with a mapped L1000 compound yield rows.
get_unichem_mapping(connection, 'drugbank', ['DB00945', 'DB00787', 'DB01048', 'DB00659', 'DB00316'])

Unnamed: 0,pert_id,resource,resource_id,C,b,i,m,p,s,t,l1000_inchi_key,query_inchi_key
0,BRD-K11433652,drugbank,DB00945,0,0,0,0,0,0,0,BSYNRYMUTXBXSQ-UHFFFAOYSA-N,BSYNRYMUTXBXSQ-UHFFFAOYSA-N
1,BRD-K32318651,drugbank,DB00787,0,0,0,0,0,0,0,MKUXAQIIEYXACX-UHFFFAOYSA-N,MKUXAQIIEYXACX-UHFFFAOYSA-N
2,BRD-K41524689,drugbank,DB00316,0,0,0,0,0,0,0,RZVAJINKPMORJF-UHFFFAOYSA-N,RZVAJINKPMORJF-UHFFFAOYSA-N


In [69]:
# Same lookup with the list repeated 2000 times (10,000 identifiers): exercises
# the chunked queries and shows that duplicate rows are dropped from the result.
get_unichem_mapping(connection, 'drugbank', ['DB00945', 'DB00787', 'DB01048', 'DB00659', 'DB00316'] * 2000)

Unnamed: 0,pert_id,resource,resource_id,C,b,i,m,p,s,t,l1000_inchi_key,query_inchi_key
0,BRD-K11433652,drugbank,DB00945,0,0,0,0,0,0,0,BSYNRYMUTXBXSQ-UHFFFAOYSA-N,BSYNRYMUTXBXSQ-UHFFFAOYSA-N
1,BRD-K32318651,drugbank,DB00787,0,0,0,0,0,0,0,MKUXAQIIEYXACX-UHFFFAOYSA-N,MKUXAQIIEYXACX-UHFFFAOYSA-N
2,BRD-K41524689,drugbank,DB00316,0,0,0,0,0,0,0,RZVAJINKPMORJF-UHFFFAOYSA-N,RZVAJINKPMORJF-UHFFFAOYSA-N


In [70]:
# Close the read-only connection opened for the example queries.
connection.close()