# Positive Control Compound

In this notebook, we try to use existing database to find compounds which have been tested on U2OS cell-line.

## CCLE Database

I will use sheet_11 of the [Excel file](https://www.nature.com/articles/nature11003#supplementary-information) to find positive compounds.

In [10]:
from rdkit import Chem
import pandas as pd
import numpy as np
import requests

In [4]:
all_df = pd.read_csv('./resource/ccle_drug.csv')

In [5]:
u2os_index = [True if i == 'U2OS_BONE' else False for i in
              all_df['CCLE Cell Line Name']]
np.sum(u2os_index)

24

There are only 24 drugs testing on U2OS on this Excel file.

In [7]:
ccle_overlapping_df = all_df.iloc[u2os_index,:]
ccle_overlapping_df

Unnamed: 0,CCLE Cell Line Name,Primary Cell Line Name,Compound,Target,Doses (uM),Activity Data (median),Activity SD,Num Data,FitType,EC50 (µM),IC50 (µM),Amax,ActArea
484,U2OS_BONE,U-2 OS,17-AAG,HSP90,".0025,.0080,.025,.080,.25,.80,2.53,8","-4.4,-13,-23,-39,-74,-91,-92,-92","16.2,7.93,.26,14.8,6.43,1.34,1.08,.64",8,Sigmoid,0.107041,0.099057,-93.582008,4.6678
987,U2OS_BONE,U-2 OS,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","4.08,14.3,4.61,-2.5,-14,-13,-44,-78","5.12,17.2,7.62,3.22,1.63,4.39,4.94,14.3",8,Sigmoid,8.763501,3.435356,-78.528435,1.5148
1491,U2OS_BONE,U-2 OS,AZD0530,ABL,".0025,.0080,.025,.080,.25,.80,2.53,8","-2.2,3.02,-.14,-3.8,-3.3,-23,-30,-41","7.72,14.6,15.9,.24,.33,5.58,5.49,14.1",8,Sigmoid,0.76691,8.0,-37.888985,1.3457
1994,U2OS_BONE,U-2 OS,AZD6244,MEK,".0025,.0080,.025,.080,.25,.80,2.53,8","4.15,-3.2,-.014,-20,-16,-26,-18,-29",".64,21.7,8.42,18.4,1.37,11.0,.21,28.7",8,Constant,,8.0,-16.204763,1.2003
2497,U2OS_BONE,U-2 OS,Erlotinib,EGFR,".0025,.0080,.025,.080,.25,.80,2.53,8","2.79,5.16,2.32,2.76,-8.9,-10,-5.4,-28","1.18,15.3,5.45,6.35,5.14,.31,6.00,17.9",8,Sigmoid,8.832891,8.0,-25.136881,0.4888
2820,U2OS_BONE,U-2 OS,Irinotecan,TOP2,".0025,.0080,.025,.080,.25,.80,2.53,8",".75,-3.3,1.35,-13,-14,-63,-78,-90","5.23,8.88,3.48,1.28,2.34,19.3,16.8,7.24",8,Sigmoid,0.591141,0.658781,-88.524544,2.6425
3306,U2OS_BONE,U-2 OS,L-685458,GS,".0025,.0080,.025,.080,.25,.80,2.53,8","2.36,6.79,8.80,9.54,10.7,-4.7,-35,-70","1.31,10.5,6.99,.30,.29,3.00,44.0,26.6",8,Sigmoid,1.068852,1.558048,-58.775295,1.163
3808,U2OS_BONE,U-2 OS,LBW242,XIAP,".0025,.0080,.025,.080,.25,.80,2.53,8","-19,-5.9,-2.0,4.67,6.56,10.5,-1.1,1.62","8.68,.37,9.34,18.6,9.57,18.0,14.3,3.54",8,Constant,,8.0,-1.130814,0.3309
4312,U2OS_BONE,U-2 OS,Lapatinib,EGFR,".0025,.0080,.025,.080,.25,.80,2.53,8","2.47,7.06,6.19,-8.6,-4.3,-27,-17,-23","4.85,1.92,8.03,6.40,6.62,6.04,10.5,2.70",8,Sigmoid,0.190182,8.0,-22.993229,0.7341
4737,U2OS_BONE,U-2 OS,Nilotinib,ABL,".0025,.0080,.025,.080,.25,.80,2.53,8","9.76,8.24,17.3,9.59,25.8,12.2,23.8,-69","4.43,2.10,9.11,10.4,8.32,1.48,13.4,19.1",8,Sigmoid,8.163845,7.420892,-69.161781,0.6846


Try to use clue.io API to find Broad ID for these compounds. Unfortunately, the clue.io API doesn't support use compound synonym to find their `pert_iname` (drug name). Instead, I will use SMILEs to match these drugs

In [13]:
# c1c655d96dfc7a422a2f2025bf290261

## Sanger Database

In [24]:
df = pd.read_csv('./resource/sanger_db.csv')
print(df.shape)
df.head()

(196, 5)


Unnamed: 0,ID,Drug Name,Targets,Z Score,Count
0,1133,Serdemetan,MDM2,-1.49865,196
1,154,CHIR-99021,"GSK3A, GSK3B",-1.485962,196
2,179,5-Fluorouracil,Antimetabolite (DNA & RNA),-1.21692,196
3,1024,Lestaurtinib,"FLT3, JAK2, NTRK1, NTRK2, NTRK3",-1.152742,196
4,1175,Rucaparib,"PARP1, PARP2",-1.142966,196


In [22]:
# Use pubchem API to convert drug name into canonical smiles
smiles_list = ['' for i in range(df.shape[0])]
get_url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
           "{}/property/CanonicalSMILES/TXT")

for i, r in df.iterrows():
    cpd_name = r['Drug Name']
    r = requests.get(get_url.format(cpd_name))
    smiles = r.text.encode("utf-8").replace('\n', '')
    smiles_list[i] = smiles

In [26]:
# Find cpds that pubchem fails to find smiles
failed_index = []
for i in range(len(smiles_list)):
    if '404' in smiles_list[i]:
        failed_index.append(i)

In [35]:
# Manually get the SMILES for those cpds
for f in failed_index:
    smiles_list[f] = raw_input(df['Drug Name'][f])

QL-XII-61
Bleomycin (50 uM)CC1=C(N=C(N=C1N)C(CC(=O)N)NCC(C(=O)N)N)C(=O)NC(C(C2=CN=CN2)OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(C(O4)CO)O)OC(=O)N)O)C(=O)NC(C)C(C(C)C(=O)NC(C(C)O)C(=O)NCCC5=NC(=CS5)C6=NC(=CS6)C(=O)NCCC[S+](C)C)O
HG-5-113-01CC(C)S(=O)(=O)C1=CC=CC=C1NC2=CC(=NC3=C2C=CN3)NC4=NC=C(C=C4)C(=O)NC
Cetuximab
Cisplatin
XMD15-27
HG-5-88-01CC(C)S(=O)(=O)C1=CC=CC=C1NC2=NC(=NC(=C2Cl)N)NC3=C(C=C(C=C3)N4CCC(CC4)C(=O)N)OC
ZG-10O=C(C1=CC=C(NC(/C=C/CN(C)C)=O)C=C1)NC2=CC(NC3=NC(C4=CC=CN=C4)=CC=N3)=CC=C2
rTRAIL
ICL1100013
XMD14-99CN1CCN(CC1)C2=CC(=C(C=C2)NC3=NC(=C(S3)C(=O)C4=C(C=CC=C4Cl)Cl)N)OC
VNLG/124
XMD13-2O=C(C1CC1)NC2=NNC3=CC(C4=CC(C(NC5CC5)=O)=CC=C4)=CC=C32
JQ12
QL-VIII-58
TL-2-105
QL-XI-92CC1=C(C=C(C=C1)C(=O)NC2=CC(=CC(=C2)C(F)(F)F)NC(=O)C=C)NC(=O)C3=CC=NO3
XMD11-85hO=C1CCN(C2CCCC2)C3=NC(NC4=CC=C(C(NC5CCN(C)CC5)=O)C=C4OC)=NC=C3N1C(C)C
KIN001-270CC1=C(C=C(C=C1)NC2=NC=NC(=C2)C3=CC(=CC=C3)N4C(=O)C5=CC=CC=C5C4=O)NS(=O)(=O)C
KIN001-236
JW-7-24-1COC1=CC(=CC(=C1)C2=CC3=C4C(=CN=C3C=C2)C=CC(=O)N4C5=CC(=C

In [38]:
len([i for i in smiles_list if i == ''])

16

There are still 16 compounds that I couldn't find SMILES.

Since some SMILES come from Harvard's LINCS library, I will re-produce the canonical SMILES for all SMILES.

In [44]:
canonical_smiles_list = []

for s in smiles_list:
    if s == '':
        canonical_smiles_list.append(s)
    else:
        molecule = Chem.MolFromSmiles(s)
        if molecule is None:
            print(s)
            canonical_smiles_list.append('')
            continue
        canonical_smiles = Chem.MolToSmiles(molecule)
        canonical_smiles_list.append(canonical_smiles)

CN1C=C(C2=CC=CC=C21)C=C3C4=C(C=CC=N4)NC3=OCN1C=C(C2=CC=CC=C21)C=C3C4=C(C=CC=N4)NC3=O.Cl


In [60]:
# Find the intersect with cpds used in our image dataset
image_df = pd.read_csv('./data/test/meta_data/chemical_annotations_inchi.csv')
image_df.head()

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT,CPD_CANONICAL_SMILES,INCHI
0,BRD-A56675431-001-04-0,altizide,INN,SA82748,,Prestwick Chemical Inc.,,Prestw-721,NS(=O)(=O)c1cc2c(NC(CSCC=C)NS2(=O)=O)cc1Cl,,C=CCSCC1Nc2cc(Cl)c(S(N)(=O)=O)cc2S(=O)(=O)N1,InChI=1S/C11H14ClN3O4S3/c1-2-3-20-6-11-14-8-4-...
1,BRD-A51829654-001-01-4,"BRL-15,572",common,SA82481,,Biomol International Inc.,,AC-536,OC(CN1CCN(CC1)c1cccc(Cl)c1)C(c1ccccc1)c1ccccc1,,OC(CN1CCN(c2cccc(Cl)c2)CC1)C(c1ccccc1)c1ccccc1,InChI=1S/C25H27ClN2O/c26-22-12-7-13-23(18-22)2...
2,BRD-K04046242-001-03-6,equilin,primary-common,SA82922,,Prestwick Chemical Inc.,,Prestw-850,C[C@]12CC[C@H]3C(=CCc4cc(O)ccc34)[C@@H]1CCC2=O,,CC12CCC3C(=CCc4cc(O)ccc43)C1CCC2=O,InChI=1S/C18H20O2/c1-18-9-8-14-13-5-3-12(19)10...
3,BRD-K16508793-001-01-8,diazepam,INN,SA59660,,MicroSource Discovery Systems Inc.,,1900003,CN1c2ccc(Cl)cc2C(=NCC1=O)c1ccccc1,,CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc21,InChI=1S/C16H13ClN2O/c1-19-14-8-7-12(17)9-13(1...
4,BRD-K09397065-001-01-6,SR 57227A,to-be-curated,SA82504,,Biomol International Inc.,,AC-561,NC1CCN(CC1)c1cccc(Cl)n1,,NC1CCN(c2cccc(Cl)n2)CC1,InChI=1S/C10H14ClN3/c11-9-2-1-3-10(13-9)14-6-4...


In [52]:
image_smiles = image_df['CPD_CANONICAL_SMILES']

In [54]:
overlapping_1 = set(canonical_smiles_list).intersection(image_smiles)
overlapping_1

{'C=CCc1cccc(C=NNC(=O)CN2CCN(Cc3ccccc3)CC2)c1O',
 'CCC(=C(c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1',
 'CCc1cc(-c2n[nH]c(C)c2-c2ccc3c(c2)OCCO3)c(O)cc1O',
 'CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc1',
 'Cn1cc(C2=C(c3ccc(Cl)cc3Cl)C(=O)NC2=O)c2ccccc21',
 'NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O',
 'O=S(=O)(c1ccccc1)N(CC(F)(F)F)c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1'}

There are only 7 overlapping compounds. We can try to intersect using InChi instead of SMILES.

In [55]:
# Use pubchem API to convert drug name into canonical smiles
inchi_list = ['' for i in range(df.shape[0])]
get_url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
           "{}/property/InChI/TXT")

for i, r in df.iterrows():
    cpd_name = r['Drug Name']
    r = requests.get(get_url.format(cpd_name))
    inchi = r.text.encode("utf-8").replace('\n', '')
    inchi_list[i] = inchi

In [65]:
# Manually get the SMILES for those cpds
for f in failed_index:
    inchi_list[f] = raw_input("{}: ".format(df['Drug Name'][f]))

QL-XII-61: 
Bleomycin (50 uM): InChI=1S/C55H83N17O21S3/c1-20-33(69-46(72-44(20)58)25(12-31(57)76)64-13-24(56)45(59)82)50(86)71-35(41(26-14-61-19-65-26)91-54-43(39(80)37(78)29(15-73)90-54)92-53-40(81)42(93-55(60)88)38(79)30(16-74)89-53)51(87)66-22(3)36(77)21(2)47(83)70-34(23(4)75)49(85)63-10-8-32-67-28(18-94-32)52-68-27(17-95-52)48(84)62-9-7-11-96(5)6/h14,17-19,21-25,29-30,34-43,53-54,64,73-75,77-81H,7-13,15-16,56H2,1-6H3,(H13-,57,58,59,60,61,62,63,65,66,69,70,71,72,76,82,83,84,85,86,87,88)/p+1/t21-,22+,23+,24-,25-,29-,30+,34-,35-,36-,37+,38+,39-,40-,41-,42-,43-,53+,54-/m0/s1
HG-5-113-01: InChI=1S/C23H24N6O3S/c1-14(2)33(31,32)19-7-5-4-6-17(19)27-18-12-21(29-22-16(18)10-11-25-22)28-20-9-8-15(13-26-20)23(30)24-3/h4-14H,1-3H3,(H,24,30)(H3,25,26,27,28,29)
Cetuximab: 
Cisplatin: 
XMD15-27: 
HG-5-88-01: InChI=1S/C26H32ClN7O4S/c1-15(2)39(36,37)21-7-5-4-6-19(21)30-25-22(27)23(28)32-26(33-25)31-18-9-8-17(14-20(18)38-3)34-12-10-16(11-13-34)24(29)35/h4-9,14-16H,10-13H2,1-3H3,(H2,29,35)(H4,28,30,31

In [69]:
# Convert inchi again
converted_inchi_list = []

for i in inchi_list:
    if i == '':
        converted_inchi_list.append(i)
    else:
        molecule = Chem.MolFromInchi(i)
        if molecule is None:
            print('Failed to read inchi')
            converted_inchi_list.append(i)
            continue
        converted_inchi = Chem.MolToInchi(molecule)
        converted_inchi_list.append(converted_inchi)

Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi
Failed to read inchi


In [71]:
# Find the intersect with cpds used in our image dataset
overlapping_inchi = set(converted_inchi_list).intersection(set(image_df['INCHI']))
overlapping_inchi

{'InChI=1S/C12H10FN3O2S/c13-7-3-1-6(2-4-7)9-5-8(10(14)17)11(19-9)16-12(15)18/h1-5H,(H2,14,17)(H3,15,16,18)',
 'InChI=1S/C15H14N2O4S/c18-15(16-19)10-9-12-5-4-8-14(11-12)22(20,21)17-13-6-2-1-3-7-13/h1-11,17,19H,(H,16,18)/b10-9+',
 'InChI=1S/C17H12F9NO3S/c18-14(19,20)10-27(31(29,30)13-4-2-1-3-5-13)12-8-6-11(7-9-12)15(28,16(21,22)23)17(24,25)26/h1-9,28H,10H2',
 'InChI=1S/C17H19NO5/c1-21-13-10-12(11-14(22-2)17(13)23-3)7-8-16(20)18-9-5-4-6-15(18)19/h4,6-8,10-11H,5,9H2,1-3H3/b8-7+',
 'InChI=1S/C19H12Cl2N2O2/c1-23-9-13(11-4-2-3-5-15(11)23)17-16(18(24)22-19(17)25)12-7-6-10(20)8-14(12)21/h2-9H,1H3,(H,22,24,25)',
 'InChI=1S/C20H20N2O4/c1-3-12-8-14(16(24)10-15(12)23)20-19(11(2)21-22-20)13-4-5-17-18(9-13)26-7-6-25-17/h4-5,8-10,23-24H,3,6-7H2,1-2H3,(H,21,22)',
 'InChI=1S/C26H29NO/c1-4-25(21-11-7-5-8-12-21)26(22-13-9-6-10-14-22)23-15-17-24(18-16-23)28-20-19-27(2)3/h5-18H,4,19-20H2,1-3H3/b26-25-',
 'InChI=1S/C27H29NO11/c1-10-22(31)13(28)6-17(38-10)39-15-8-27(36,16(30)9-29)7-12-19(15)26(35)21-20(24(12)

Here we have 11 overlapping compounds.

In [96]:
# Find associated scores for these inchi
inchi_index = []
for i in range(len(converted_inchi_list)):
    if converted_inchi_list[i] in overlapping_inchi:
        inchi_index.append(i)
inchi_index

[7, 20, 26, 29, 69, 78, 133, 172, 182, 186]

In [98]:
inchi_score_dict = {}

for i in inchi_index:
    inchi_score_dict[converted_inchi_list[i]] = df['Z Score'][i]
    
inchi_score_dict

{'InChI=1S/C12H10FN3O2S/c13-7-3-1-6(2-4-7)9-5-8(10(14)17)11(19-9)16-12(15)18/h1-5H,(H2,14,17)(H3,15,16,18)': 1.4170812991639807,
 'InChI=1S/C15H14N2O4S/c18-15(16-19)10-9-12-5-4-8-14(11-12)22(20,21)17-13-6-2-1-3-7-13/h1-11,17,19H,(H,16,18)/b10-9+': -0.95548114460381517,
 'InChI=1S/C17H12F9NO3S/c18-14(19,20)10-27(31(29,30)13-4-2-1-3-5-13)12-8-6-11(7-9-12)15(28,16(21,22)23)17(24,25)26/h1-9,28H,10H2': 0.038028979264701737,
 'InChI=1S/C17H19NO5/c1-21-13-10-12(11-14(22-2)17(13)23-3)7-8-16(20)18-9-5-4-6-15(18)19/h4,6-8,10-11H,5,9H2,1-3H3/b8-7+': -0.59774322856227036,
 'InChI=1S/C19H12Cl2N2O2/c1-23-9-13(11-4-2-3-5-15(11)23)17-16(18(24)22-19(17)25)12-7-6-10(20)8-14(12)21/h2-9H,1H3,(H,22,24,25)': -0.63420497314175805,
 'InChI=1S/C20H20N2O4/c1-3-12-8-14(16(24)10-15(12)23)20-19(11(2)21-22-20)13-4-5-17-18(9-13)26-7-6-25-17/h4-5,8-10,23-24H,3,6-7H2,1-2H3,(H,21,22)': -0.69909760833211643,
 'InChI=1S/C26H29NO/c1-4-25(21-11-7-5-8-12-21)26(22-13-9-6-10-14-22)23-15-17-24(18-16-23)28-20-19-27(2)3/h5-18H,4

In [101]:
overlapping_index = []
overlapping_score = []
for i in range(len(image_df['INCHI'])):
    if image_df['INCHI'][i] in overlapping_inchi:
        overlapping_index.append(i)
        overlapping_score.append(inchi_score_dict[image_df['INCHI'][i]])

In [76]:
image_df.iloc[overlapping_index, :]

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT,CPD_CANONICAL_SMILES,INCHI
54,BRD-K17743125-001-01-9,belinostat,USAN,SA68090,,Broad Institute CMLD,,,ONC(=O)\C=C\c1cccc(c1)S(=O)(=O)Nc1ccccc1,,O=C(C=Cc1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,InChI=1S/C15H14N2O4S/c18-15(16-19)10-9-12-5-4-...
230,BRD-K93754473-001-02-9,tamoxifen,INN,SA82346,,Biomol International Inc.,,S-650,CC\C(c1ccccc1)=C(/c1ccccc1)c1ccc(OCCN(C)C)cc1,,CCC(=C(c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,InChI=1S/C26H29NO/c1-4-25(21-11-7-5-8-12-21)26...
314,BRD-K92093830-003-05-0,doxorubicin,INN,SA82891,,Prestwick Chemical Inc.,,Prestw-438,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C...,,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)CC(O)(C(=...,InChI=1S/C27H29NO11/c1-10-22(31)13(28)6-17(38-...
1542,BRD-K65503129-001-01-4,CCT018159,primary-common,SA792853,,Tocris Bioscience,,2435,CCc1cc(-c2n[nH]c(C)c2-c2ccc3OCCOc3c2)c(O)cc1O,,CCc1cc(-c2n[nH]c(C)c2-c2ccc3c(c2)OCCO3)c(O)cc1O,InChI=1S/C20H20N2O4/c1-3-12-8-14(16(24)10-15(1...
1545,BRD-K69023402-001-02-5,thapsigargin,common,SA82112,,Biomol International Inc.,,PE-180,CCCCCCCC(=O)O[C@@H]1[C@@H](OC(=O)C(\C)=C/C)C(C...,,CC=C(C)C(=O)OC1C(C)=C2C(C1OC(=O)CCCCCCC)C(C)(O...,InChI=1S/C34H50O12/c1-9-12-13-14-15-17-24(37)4...
1575,BRD-K24132293-001-02-0,piperlongumine,primary-common,SA37589,,AnalytiCon Discovery GmbH,,NP-003716,COc1cc(cc(OC)c1OC)\C=C\C(=O)N1CCC=CC1=O,,COc1cc(C=CC(=O)N2CCC=CC2=O)cc(OC)c1OC,InChI=1S/C17H19NO5/c1-21-13-10-12(11-14(22-2)1...
1614,BRD-K23383398-001-01-6,T-0901317,primary-common,SA792830,,Tocris Bioscience,,2373,OC(c1ccc(cc1)N(CC(F)(F)F)S(=O)(=O)c1ccccc1)(C(...,,O=S(=O)(c1ccccc1)N(CC(F)(F)F)c1ccc(C(O)(C(F)(F...,"InChI=1S/C17H12F9NO3S/c18-14(19,20)10-27(31(29..."
1653,BRD-K37798499-001-02-5,etoposide,INN,SA82914,,Prestwick Chemical Inc.,,Prestw-396,COc1cc(cc(OC)c1O)[C@H]1[C@@H]2[C@H](COC2=O)[C@...,,COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,InChI=1S/C29H32O13/c1-11-36-9-20-27(40-11)24(3...
1873,BRD-K51575138-001-01-7,TPCA-1,primary-common,SA792920,,Tocris Bioscience,,2559,NC(=O)Nc1sc(cc1C(N)=O)-c1ccc(F)cc1,,NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O,InChI=1S/C12H10FN3O2S/c13-7-3-1-6(2-4-7)9-5-8(...
2233,BRD-K59184148-001-04-2,SB-216763,primary-common,SA792563,,Tocris Bioscience,,1616,Cn1cc(C2=C(C(=O)NC2=O)c2ccc(Cl)cc2Cl)c2ccccc12,,Cn1cc(C2=C(c3ccc(Cl)cc3Cl)C(=O)NC2=O)c2ccccc21,InChI=1S/C19H12Cl2N2O2/c1-23-9-13(11-4-2-3-5-1...


In [103]:
overlapping_bid = image_df.iloc[overlapping_index, :]['BROAD_ID'].tolist()
overlapping_bid_dict = dict(zip(overlapping_bid, overlapping_score))
overlapping_bid_dict

{'BRD-K17743125-001-01-9': -0.95548114460381517,
 'BRD-K23383398-001-01-6': 0.038028979264701737,
 'BRD-K24132293-001-02-0': -0.59774322856227036,
 'BRD-K37798499-001-02-5': 1.7845468967387552,
 'BRD-K51575138-001-01-7': 1.4170812991639807,
 'BRD-K59184148-001-04-2': -0.63420497314175805,
 'BRD-K59184148-001-10-9': -0.63420497314175805,
 'BRD-K65503129-001-01-4': -0.69909760833211643,
 'BRD-K69023402-001-02-5': 0.18927683315036906,
 'BRD-K92093830-003-05-0': 1.919535291573689,
 'BRD-K93754473-001-02-9': 0.83160357303321086}

## Find Corresponding wells

Now, we have the overlapping bids. We can trace back the wells and plates where these compounds were tested.

In [82]:
plate_table = pd.read_csv('./resource/merged_table_406.csv')

In [105]:
bids = plate_table['Metadata_broad_sample'].tolist()
plate_table_overlapping = {}
plate_table_index = []

for b in range(len(bids)):
    if bids[b] in overlapping_bid_dict:
        plate_table_index.append(b)
        if bids[b] not in plate_table_overlapping:
            plate_table_overlapping[bids[b]] = [b]
        else:
            plate_table_overlapping[bids[b]].append(b)
        
print(len(plate_table_index))

84


There are 84 wells having these compounds.

In [114]:
pids = plate_table['Metadata_Plate'][plate_table_index]
wids = plate_table['Metadata_Well'][plate_table_index]

pairs = []
pw_dict = {}
for k in plate_table_overlapping:
    for i in plate_table_overlapping[k]:
        pw_dict[(pids[i], wids[i])] = overlapping_bid_dict[k]
        pairs.append((pids[i], wids[i]))
        
pw_dict

{(24277, 'c24'): 0.038028979264701737,
 (24277, 'd13'): -0.59774322856227036,
 (24277, 'o08'): 1.7845468967387552,
 (24278, 'k04'): 1.919535291573689,
 (24278, 'l21'): 0.83160357303321086,
 (24279, 'k04'): 1.919535291573689,
 (24279, 'l21'): 0.83160357303321086,
 (24280, 'k04'): 1.919535291573689,
 (24280, 'l21'): 0.83160357303321086,
 (24293, 'k04'): 1.919535291573689,
 (24293, 'l21'): 0.83160357303321086,
 (24296, 'c24'): 0.038028979264701737,
 (24296, 'd13'): -0.59774322856227036,
 (24296, 'o08'): 1.7845468967387552,
 (24305, 'p14'): -0.95548114460381517,
 (24306, 'p14'): -0.95548114460381517,
 (24307, 'p14'): -0.95548114460381517,
 (24308, 'c24'): 0.038028979264701737,
 (24308, 'd13'): -0.59774322856227036,
 (24308, 'o08'): 1.7845468967387552,
 (24309, 'c24'): 0.038028979264701737,
 (24309, 'd13'): -0.59774322856227036,
 (24309, 'o08'): 1.7845468967387552,
 (24310, 'i23'): 1.4170812991639807,
 (24310, 'o15'): -0.63420497314175805,
 (24311, 'i23'): 1.4170812991639807,
 (24311, 'o15'

In [115]:
with open('args.txt', 'w') as fp:
    for p in set(pairs):
        for sid in [1, 2, 3, 4, 5, 6]:
            fp.write("{},{},{},{}\n".format(p[0], p[1], sid, 1))

In [123]:
temp_dict = {}
for k in pw_dict:
    cw = '{}_{}'.format(k[0], k[1])
    temp_dict[cw] = pw_dict[k]

In [120]:
pre_norm = pd.read_csv('./resource/umap_combined_feature_406.csv')

In [126]:
pre_norm_index = []
for i in range(len(pre_norm['label'])):
    if pre_norm['label'][i] in temp_dict:
        pre_norm_index.append(i)




In [128]:
len(pre_norm_index)

458

In [135]:
pre_norm_overlap = pre_norm.iloc[pre_norm_index,:].copy()

In [136]:
cor_scores = []
for i, r in pre_norm_overlap.iterrows():
    cor_scores.append(temp_dict[r['label']])
pre_norm_overlap['score'] = cor_scores

In [138]:
pre_norm_overlap.to_csv('pre_nrom_overlap.csv', index=False)

In [140]:
post_norm = pd.read_csv('./resource/umap_normed_features.csv')

In [141]:
post_norm_index = []
for i in range(len(pre_norm['label'])):
    if post_norm['label'][i] in temp_dict:
        post_norm_index.append(i)
len(pre_norm_index)

458

In [142]:
post_norm_overlap = post_norm.iloc[post_norm_index,:].copy()
cor_scores = []
for i, r in post_norm_overlap.iterrows():
    cor_scores.append(temp_dict[r['label']])
post_norm_overlap['score'] = cor_scores

In [143]:
post_norm_overlap.to_csv('post_nrom_overlap.csv', index=False)

In [144]:
post_norm_overlap

Unnamed: 0,x,y,label,label_2,score
9077,2.148799,-3.753407,24305_p14,5,-0.955481
9078,2.353336,-3.621856,24305_p14,4,-0.955481
9079,2.571708,-3.003010,24305_p14,3,-0.955481
9080,1.858499,-4.042998,24305_p14,1,-0.955481
9081,1.394501,-4.192612,24305_p14,6,-0.955481
9082,2.024525,-3.983737,24305_p14,2,-0.955481
28255,1.575198,-3.963285,24306_p14,4,-0.955481
28256,2.843200,-2.593352,24306_p14,2,-0.955481
28257,2.320187,-0.740441,24306_p14,6,-0.955481
28258,2.330403,-3.323104,24306_p14,3,-0.955481


In [148]:
np.max(cor_scores)

1.919535291573689