In [1]:
import rdkit
import rdkit.Chem
import rdkit.Chem.inchi
import rdkit.Chem.AllChem
import rdkit.DataStructs
import pandas
import sqlite3

In [2]:
# Establish database connection
# Opens the SQLite file at a path relative to the current working directory
# and creates a cursor on that connection.
connection = sqlite3.connect('data/l1000.db')
cursor = connection.cursor()

In [3]:
# Load every row of `perts` whose pert_type is 'trt_cp' and whose inchi_string
# is present into a DataFrame. NOTNULL is SQLite's postfix form of IS NOT NULL.
query = """
SELECT * FROM perts
WHERE pert_type == 'trt_cp'
AND inchi_string NOTNULL;
"""
pert_df = pandas.read_sql(query, connection)

In [4]:
# Preview the first rows of the query result.
pert_df.head()

Unnamed: 0,pert_uid,pert_id,pert_iname,pert_type,num_gold,num_inst,num_sig,in_summly,inchi_string,inchi_key,pubchem_cid
0,12,BRD-K68741898,BRD-K68741898,trt_cp,0,3,1,0,InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1...,MNAJUJYQFCFYAB-YQVWRLOYSA-N,44505553
1,13,BRD-A05457250,BAY-K8644,trt_cp,1,23,8,0,InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14...,ZFLWDHHVRRZMEI-UHFFFAOYSA-N,2303
2,14,BRD-K72034655,peucedanin,trt_cp,2,33,8,0,InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5...,YQBNJPACAUPNLV-UHFFFAOYSA-N,8616
3,15,BRD-K02458594,KU-C103869,trt_cp,2,44,13,0,InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12...,YCUIAYUVYLNFFS-UHFFFAOYSA-N,21785456
4,16,BRD-K18814832,BRD-K18814832,trt_cp,1,49,13,0,InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-...,ZQJTYJZLKBRKPC-UHFFFAOYSA-N,2228302


In [5]:
# Build one record per compound: verify its stored InChIKey, parse the InChI
# into an RDKit molecule, and compute the Morgan fingerprint (radius 2) that
# the similarity calculation consumes.
rows = list()

for i, series in pert_df.iterrows():
    # check InChI Keys: the key recomputed from the InChI string must equal
    # the one stored in the database
    inchi = series.inchi_string
    inchi_key = rdkit.Chem.inchi.InchiToInchiKey(inchi)
    assert inchi_key == series.inchi_key, (
        'InChIKey mismatch for pert_uid {}: computed {!r}, stored {!r}'.format(
            series.pert_uid, inchi_key, series.inchi_key))

    # molecule
    mol = rdkit.Chem.MolFromInchi(inchi)
    if mol is None:
        # MolFromInchi reports a parse failure by returning None; fail here
        # with the offending row instead of inside the fingerprint call.
        raise ValueError(
            'Could not parse InChI for pert_uid {}: {!r}'.format(series.pert_uid, inchi))

    # fingerprint
    fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)

    rows.append([series.pert_uid, inchi_key, inchi, mol, fingerprint])

inchi_df = pandas.DataFrame(rows, columns=['pert_uid', 'inchi_key', 'inchi_string', 'mol', 'fingerprint'])
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
# The original row index is kept, exactly as the in-place sort did.
inchi_df = inchi_df.sort_values('inchi_key')

In [6]:
# Preview the first rows of the fingerprint table (ordered by inchi_key).
inchi_df.head()

Unnamed: 0,pert_uid,inchi_key,inchi_string,mol,fingerprint
9081,39929,AAALVYBICLMAMA-UHFFFAOYSA-N,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...,<rdkit.Chem.rdchem.Mol object at 0x7f186cce87b8>,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2664,9777,AACFPJSJOWQNBN-UHFFFAOYSA-N,InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-...,<rdkit.Chem.rdchem.Mol object at 0x7f186d6d7668>,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
789,2195,AADCDMQTJNYOSS-LBPRGKRZSA-N,InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)...,<rdkit.Chem.rdchem.Mol object at 0x7f186d71f518>,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
16462,47318,AADVJQLQUVDEBP-GQIGUUNPSA-N,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,<rdkit.Chem.rdchem.Mol object at 0x7f186c668828>,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
19637,50497,AADVJQLQUVDEBP-GUXCAODWSA-N,InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9...,<rdkit.Chem.rdchem.Mol object at 0x7f186c505b38>,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [7]:
# # Delete all rows from the similarities table (the table itself is kept)
# cursor.execute('DELETE FROM similarities;')
# connection.commit()

In [55]:
# Compute the Dice similarity of every compound's fingerprint against all
# fingerprints and assemble (pert_uid_0, pert_uid_1, similarity) rows.
#
# The ids are converted to built-in ints: sqlite3 does not adapt numpy.int64
# and binds such values as 8-byte BLOBs instead of INTEGERs.
pert_uids = [int(pert_uid) for pert_uid in inchi_df.pert_uid]
fingerprints = list(inchi_df.fingerprint)

for i, series in inchi_df.iterrows():
    # '\r' keeps the progress indicator on a single console line
    print(series.inchi_key, '\r', end='')
    similarities = rdkit.DataStructs.BulkDiceSimilarity(series.fingerprint, fingerprints)
    similarities = [round(x, 4) for x in similarities]
    query_uid = int(series.pert_uid)
    # A list rather than a generator: the preview below must not consume the
    # rows that the (currently disabled) insert is meant to write.
    rows = [(query_uid, pert_uid, similarity) for pert_uid, similarity in zip(pert_uids, similarities)]
    print(rows[0])
    #cursor.executemany('INSERT INTO similarities VALUES (?,?,?)', rows)
    #connection.commit()

(39929, 39929, 1.0)
(9777, 39929, 0.2689)
(2195, 39929, 0.1324)
(47318, 39929, 0.2078)
(50497, 39929, 0.2078)
(11127, 39929, 0.2078)
(39508, 39929, 0.2078)
(38442, 39929, 0.2078)
(41911, 39929, 0.2078)
(49330, 39929, 0.4064)
(49013, 39929, 0.4064)
(48238, 39929, 0.4064)
(46035, 39929, 0.4064)
(41606, 39929, 0.4064)
(42253, 39929, 0.4064)
(23302, 39929, 0.0173)
(23851, 39929, 0.3057)
(48777, 39929, 0.2733)
(11662, 39929, 0.4328)
(13503, 39929, 0.3448)
(23498, 39929, 0.4462)
(22582, 39929, 0.2222)
(13926, 39929, 0.3367)
(42873, 39929, 0.3315)
(47678, 39929, 0.3315)
(41469, 39929, 0.3315)
(48965, 39929, 0.3315)


KeyboardInterrupt: 

In [9]:
# Close the database connection; the example section below opens its own
# read-only connection.
connection.close()

## Example usage

In [11]:
# Reconnect through a SQLite URI (uri=True) with mode=ro, i.e. read-only,
# and create a cursor on that connection.
connection = sqlite3.connect('file:data/l1000.db?mode=ro', uri=True)
cursor = connection.cursor()

In [43]:
pert_ids = ['BRD-K68741898', 'BRD-A05457250']

# One '?' placeholder per identifier; the values themselves are passed
# separately through `params`.
placeholders = ','.join(['?'] * len(pert_ids))
query = """
SELECT pert_uid, pert_id FROM perts
WHERE pert_id IN ({})
""".format(placeholders)

pandas.read_sql(sql=query, con=connection, params=pert_ids)

Unnamed: 0,pert_uid,pert_id
0,13,BRD-A05457250
1,12,BRD-K68741898


In [51]:
pert_ids = ['BRD-K68741898', 'BRD-A05457250']

# The subquery selects the perts rows for the given identifiers; the join
# then attaches every similarities row whose pert_uid_0 equals that pert_uid.
placeholders = ','.join(['?'] * len(pert_ids))
query = """
SELECT * FROM (
    SELECT pert_uid, pert_id FROM perts
    WHERE pert_id IN ({})
) INNER JOIN similarities ON pert_uid = similarities.pert_uid_0;
""".format(placeholders)

pandas.read_sql(sql=query, con=connection, params=pert_ids)

Unnamed: 0,pert_uid,pert_id,pert_uid_0,pert_uid_1,chemical
0,13,BRD-A05457250,13,b'\x00\x01\x00\x00\x00\x00\x00\x00',0.2794
1,13,BRD-A05457250,13,b'\x00\x05\x00\x00\x00\x00\x00\x00',0.1830
2,13,BRD-A05457250,13,b'\x00\n\x00\x00\x00\x00\x00\x00',0.2957
3,13,BRD-A05457250,13,b'\x00\x1c\x00\x00\x00\x00\x00\x00',0.1676
4,13,BRD-A05457250,13,b'\x00 \x00\x00\x00\x00\x00\x00',0.1386
5,13,BRD-A05457250,13,b'\x00!\x00\x00\x00\x00\x00\x00',0.2703
6,13,BRD-A05457250,13,"b'\x00""\x00\x00\x00\x00\x00\x00'",0.2016
7,13,BRD-A05457250,13,b'\x00#\x00\x00\x00\x00\x00\x00',0.1739
8,13,BRD-A05457250,13,b'\x00$\x00\x00\x00\x00\x00\x00',0.2584
9,13,BRD-A05457250,13,b'\x00%\x00\x00\x00\x00\x00\x00',0.3115


In [54]:
# NOTE(review): pert_ids is not referenced by the query in this cell; it looks
# like a leftover from the preceding example — confirm before removing.
pert_ids = ['BRD-K68741898', 'BRD-A05457250']

# Read the first five rows of the similarities table.
query = """
SELECT * FROM similarities LIMIT 5;
"""
pandas.read_sql(query, connection)

Unnamed: 0,pert_uid_0,pert_uid_1,chemical
0,39929,b'\xf9\x9b\x00\x00\x00\x00\x00\x00',1.0
1,39929,b'1&\x00\x00\x00\x00\x00\x00',0.2689
2,39929,b'\x93\x08\x00\x00\x00\x00\x00\x00',0.1324
3,39929,b'\xd6\xb8\x00\x00\x00\x00\x00\x00',0.2078
4,39929,b'A\xc5\x00\x00\x00\x00\x00\x00',0.2078
