In [1]:
import collections
import gzip
import time

import pandas
import numpy
import sqlite3

import unichem

In [2]:
# Open the local L1000 SQLite database using URI syntax, then create a cursor
# for the raw SQL statements (DDL / PRAGMA) executed in later cells.
connection = sqlite3.connect(database='file:data/l1000.db', uri=True)
cursor = connection.cursor()

In [3]:
# Fetch compound-treatment perturbagens ('trt_cp') that have an InChI string,
# ordered by InChI key.
query = """
SELECT * FROM perts
WHERE pert_type == 'trt_cp'
AND inchi_string NOTNULL
ORDER BY inchi_key;
"""
# NOTE(review): .head() keeps only the first five rows of the query result, so
# every later cell works on that five-row subset -- confirm this is intended.
pert_df = pandas.read_sql(sql=query, con=connection).head()

In [4]:
# Preview the first rows of the perturbagen table.
pert_df.head(n=5)

Unnamed: 0,pert_uid,pert_id,pert_iname,pert_type,num_gold,num_inst,num_sig,in_summly,inchi_string,inchi_key,pubchem_cid
0,39929,BRD-K13087974,"4,5-dianilinophthalimide",trt_cp,13,65,18,1,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...,AAALVYBICLMAMA-UHFFFAOYSA-N,1697
1,9777,BRD-K03568209,MW-STK33-4C,trt_cp,14,120,32,1,InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-...,AACFPJSJOWQNBN-UHFFFAOYSA-N,755673
2,2195,BRD-K50417881,eticlopride,trt_cp,4,64,19,0,InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)...,AADCDMQTJNYOSS-LBPRGKRZSA-N,6917728
3,47318,BRD-K70633610,BRD-K70633610,trt_cp,0,8,3,0,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,AADVJQLQUVDEBP-GQIGUUNPSA-N,54631316
4,50497,BRD-K95549765,BRD-K95549765,trt_cp,2,8,3,0,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,AADVJQLQUVDEBP-GUXCAODWSA-N,54631309


In [5]:
# Turn the unichem.id_to_source mapping into a two-column lookup table
# (src_id -> resource name).
src_df = pandas.DataFrame(
    [(src_id, resource) for src_id, resource in unichem.id_to_source.items()],
    columns=['src_id', 'resource'],
)
# Store src_id as text; presumably this matches the dtype of the src_id values
# returned by the UniChem search so the later merge lines up -- TODO confirm.
src_df['src_id'] = src_df['src_id'].astype(str)
src_df.head()

Unnamed: 0,src_id,resource
0,0,
1,1,chembl
2,2,drugbank
3,3,pdb
4,4,iuphar


In [6]:
# Create the table that stores UniChem mappings (no-op if it already exists).
# One row per (perturbagen, resource, resource id); the remaining integer
# columns are filled from the search results in the loop below.
command = '''
CREATE TABLE IF NOT EXISTS unichem
(
    pert_uid INTEGER,
    resource TEXT,
    resource_id TEXT,
    C INTEGER,
    b INTEGER,
    i INTEGER,
    m INTEGER,
    p INTEGER,
    s INTEGER,
    t INTEGER,
    PRIMARY KEY(pert_uid, resource, resource_id),
    FOREIGN KEY(pert_uid) REFERENCES perts(pert_uid)
);
'''
cursor.execute(command)

# PRAGMA table_info yields one row per column with the column name at index 1.
# Skip the first row (pert_uid) and keep the names of the remaining columns.
unichem_columns = [
    row[1]
    for row in cursor.execute('PRAGMA table_info(unichem);').fetchall()[1:]
]
unichem_columns

['resource', 'resource_id', 'C', 'b', 'i', 'm', 'p', 's', 't']

In [7]:
# Look up every perturbagen in UniChem by InChI key and store the resulting
# cross-references in the `unichem` table, one perturbagen per transaction.
for i, series in pert_df.iterrows():
    # Single-line progress indicator: '\r' rewinds so the next key overwrites it.
    print(series.inchi_key, '\r', end='')
    map_df = pandas.DataFrame(unichem.key_search(series.inchi_key, C=4))

    # An empty search result has no src_id column to merge on; skip the insert
    # instead of letting the merge raise and abort the whole run.
    if not map_df.empty:
        # Cast to a plain int so sqlite3 can bind it as a query parameter.
        pert_uid = int(series.pert_uid)

        # Translate UniChem source ids into resource names.
        map_df = map_df.merge(src_df)
        map_df['src_compound_id'] = map_df['src_compound_id'].astype(str)
        map_df = map_df.rename(columns={'src_compound_id': 'resource_id'})

        # Record which perturbagen these mappings belong to. `unichem_columns`
        # does not contain pert_uid, so without this every row would be
        # inserted with a NULL pert_uid and could not be joined back to `perts`.
        map_df['pert_uid'] = pert_uid
        map_df = map_df[['pert_uid'] + unichem_columns]

        # Replace any rows stored for this perturbagen by an earlier run so the
        # cell can be re-executed without violating the primary key.
        connection.execute('DELETE FROM unichem WHERE pert_uid = ?', (pert_uid,))
        map_df.to_sql('unichem', connection, if_exists='append', index=False)
        connection.commit()

    # Pause between requests to avoid hammering the UniChem service.
    time.sleep(2)



In [None]:
# starting at 18.3 gigabytes