In [None]:
import rdkit
import rdkit.Chem
import rdkit.Chem.inchi
import rdkit.Chem.AllChem
import rdkit.DataStructs
import pandas
import sqlite3

In [None]:
# Establish database connection
connection = sqlite3.connect('data/l1000.db')
cursor = connection.cursor()

In [None]:
query = """
SELECT * FROM perts
WHERE pert_type == 'trt_cp'
AND inchi_string NOTNULL;
"""
pert_df = pandas.read_sql(query, connection)

In [None]:
pert_df.head()

In [None]:
rows = list()

for i, series in pert_df.iterrows():
    # check InChI Keys
    inchi = series.inchi_string
    inchi_key = rdkit.Chem.inchi.InchiToInchiKey(inchi)
    assert inchi_key == series.inchi_key
    
    # molecule
    mol = rdkit.Chem.MolFromInchi(inchi)
    
    # fingerprint
    fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)
    
    rows.append([series.pert_uid, inchi_key, inchi, mol, fingerprint])

inchi_df = pandas.DataFrame(rows, columns=['pert_uid', 'inchi_key', 'inchi_string', 'mol', 'fingerprint'])
inchi_df.sort('inchi_key', inplace=True)

In [None]:
inchi_df.head()

In [None]:
# # Delete similarities table
# cursor.execute('DELETE FROM similarities;')
# connection.commit()

In [None]:
pert_uids = list(inchi_df.pert_uid)
fingerprints = list(inchi_df.fingerprint)

for i, series in inchi_df.iterrows():
    print(series.inchi_key, '\r', end='')
    similarities = rdkit.DataStructs.BulkDiceSimilarity(series.fingerprint, fingerprints)
    similarities = [round(x, 4) for x in similarities]
    rows = ((series.pert_uid, pert_uid, similarity) for pert_uid, similarity in zip(pert_uids, similarities))
    cursor.executemany('INSERT INTO similarities VALUES (?,?,?)', rows)
    connection.commit()

In [None]:
# close database connection
connection.close()

## Example usage

In [None]:
connection = sqlite3.connect('file:data/l1000.db?mode=ro', uri=True)
cursor = connection.cursor()

In [57]:
pert_ids = ['BRD-K68741898', 'BRD-A05457250']

query = """
SELECT pert_uid, pert_id FROM perts
WHERE pert_id IN ({})
""".format(','.join('?' * len(pert_ids)))

pandas.read_sql(query, connection, params=pert_ids)

Unnamed: 0,pert_uid,pert_id
0,13,BRD-A05457250
1,12,BRD-K68741898


In [59]:
pert_ids = ['BRD-K68741898', 'BRD-A05457250']

query = """
SELECT * FROM (
    SELECT pert_uid, pert_id FROM perts
    WHERE pert_id IN ({})
) INNER JOIN similarities ON pert_uid = similarities.pert_uid_0;
""".format(','.join('?' * len(pert_ids)))

pandas.read_sql(query, connection, params=pert_ids).head()

Unnamed: 0,pert_uid,pert_id,pert_uid_0,pert_uid_1,chemical
0,13,BRD-A05457250,13,b'\x00\x01\x00\x00\x00\x00\x00\x00',0.2794
1,13,BRD-A05457250,13,b'\x00\x05\x00\x00\x00\x00\x00\x00',0.183
2,13,BRD-A05457250,13,b'\x00\n\x00\x00\x00\x00\x00\x00',0.2957
3,13,BRD-A05457250,13,b'\x00\x1c\x00\x00\x00\x00\x00\x00',0.1676
4,13,BRD-A05457250,13,b'\x00 \x00\x00\x00\x00\x00\x00',0.1386


In [61]:
pandas.read_sql('pragma table_info(similarities)', connection)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,pert_uid_0,INTEGER,1,,1
1,1,pert_uid_1,INTEGER,1,,2
2,2,chemical,REAL,0,,0


In [62]:
query = """
SELECT * FROM similarities LIMIT 5;
"""
pandas.read_sql(query, connection)

Unnamed: 0,pert_uid_0,pert_uid_1,chemical
0,39929,b'\xf9\x9b\x00\x00\x00\x00\x00\x00',1.0
1,39929,b'1&\x00\x00\x00\x00\x00\x00',0.2689
2,39929,b'\x93\x08\x00\x00\x00\x00\x00\x00',0.1324
3,39929,b'\xd6\xb8\x00\x00\x00\x00\x00\x00',0.2078
4,39929,b'A\xc5\x00\x00\x00\x00\x00\x00',0.2078


UnicodeDecodeError: 'ascii' codec can't decode byte 0xf9 in position 0: ordinal not in range(128)