In [21]:
import pandas as pd

# Read both input tables: our extracted results and the BindingDB reference set.
our_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/csv/result.csv',
        '../../BindingDB_singlechain_deduplicated.csv',
    )
)

In [22]:
# Drop fully identical rows from each table before they are compared.
our_df, test_df = (frame.drop_duplicates() for frame in (our_df, test_df))

In [23]:
# Key columns used to match a row of our_df with rows of test_df.
compare_cols = ['Ligand InChI Key', 'Sequence']

# Left-join the reference measurements onto our rows by (Ligand InChI Key, Sequence):
# every our_df row is kept, and measurement columns coming from test_df that clash
# with an our_df column name receive the '_ref' suffix.
reference_cols = compare_cols + ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)']
merged = our_df.merge(
    test_df[reference_cols],
    how='left',
    on=compare_cols,
    suffixes=('', '_ref'),
)

In [24]:
# Requires `merged` from the preceding merge step.
def _rows_with_both(frame, value_col, ref_col):
    """Return the rows of `frame` where `value_col` and `ref_col` are both non-null."""
    has_value = frame[value_col].notnull()
    has_ref = frame[ref_col].notnull()
    return frame[has_value & has_ref]


# One comparison frame per measurement type.
filtered_ki = _rows_with_both(merged, 'Ki (nM)', 'Ki (nM)_ref')
filtered_ic = _rows_with_both(merged, 'IC50 (nM)', 'IC50 (nM)_ref')
filtered_ec = _rows_with_both(merged, 'EC50 (nM)', 'EC50 (nM)_ref')
filtered_kd = _rows_with_both(merged, 'Kd (nM)', 'Kd (nM)_ref')


In [25]:
# Reorder columns: group each measure with its ref
ordered_cols = [
    'Ligand InChI Key', 'Sequence',
    'Ki (nM)', 'Ki (nM)_ref',
    'IC50 (nM)', 'IC50 (nM)_ref',
    'Kd (nM)', 'Kd (nM)_ref',
    'EC50 (nM)', 'EC50 (nM)_ref'
]

# Every other column of `merged` follows the main block, in its existing order.
remaining_cols = [col for col in merged.columns if col not in ordered_cols]
final_cols = ordered_cols + remaining_cols

# (our column, reference column) pairs that should hold numeric values.
metrics = [
    ("Ki (nM)", "Ki (nM)_ref"),
    ("IC50 (nM)", "IC50 (nM)_ref"),
    ("Kd (nM)", "Kd (nM)_ref"),
    ("EC50 (nM)", "EC50 (nM)_ref"),
]


def _coerce_metrics_to_numeric(frame):
    """Return a copy of `frame` with the metric columns converted to numbers.

    Each column named in `metrics` that exists in `frame` is passed through
    `pd.to_numeric(..., errors='coerce')`, so entries that cannot be parsed
    as a number become NaN. The frame passed in is not modified.
    """
    converted = frame.copy()
    for main_col, ref_col in metrics:
        for col in (main_col, ref_col):
            if col in converted.columns:
                # Assign the column outright instead of via `.loc[:, col] = ...`:
                # the `.loc` form may write into the existing object-dtype
                # column and leave the dtype as object.
                converted[col] = pd.to_numeric(converted[col], errors='coerce')
    return converted


# Rebind each name to the converted frame. Converting a loop-local copy would
# leave these frames untouched, with their values still stored as strings.
# NOTE(review): rows whose value is coerced to NaN are kept, not dropped.
filtered_ki = _coerce_metrics_to_numeric(filtered_ki)[final_cols]
filtered_ic = _coerce_metrics_to_numeric(filtered_ic)[final_cols]
filtered_ec = _coerce_metrics_to_numeric(filtered_ec)[final_cols]
filtered_kd = _coerce_metrics_to_numeric(filtered_kd)[final_cols]



In [37]:
# Display the rows that have both our Ki value and a reference Ki value.
filtered_ki

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,protein_target_name,patent_number,molecule_name,Metric,Ligand SMILES
11,AXKPFOAXAHJUAG-UHFFFAOYSA-N,MGNRSTADADGLLAGRGPAAGASAGASAGLAGQGAAALVGGVLLIG...,10.0,5.1,,,,,,,,US-7884096-B2,,Ki (nM),
13,BLGXFZZNTVWLAY-SCYLSFHTSA-N,MASPALAAALAAAAAEGPNGSDAGEWGSGGGANASGTDWGPPPGQY...,25000.0,0.44,,,,,,,,EP-1534313-B1,,Ki (nM),
22,BQJCRHHNABKAKU-KBQPJGBKSA-N,MDSPIQIFRGEPGPTCAPSACLPPNSSAWFPGWAEPDSNGSAGSED...,118.38,11.1,,,,,,,kappa opioid receptor,US-8952032-B2,Morphine,Ki (nM),
25,BQJCRHHNABKAKU-KBQPJGBKSA-N,MDSSAAPTNASNCTDALAYSSCSPAPSPGSWVNLSHLDGNLSDPCG...,8.44,0.14,,,,,,,mu opioid receptor,US-8952032-B2,Morphine,Ki (nM),
28,BQJCRHHNABKAKU-KBQPJGBKSA-N,MEPVPSARAELQFSLLANVSDTFPSAFPSASANASGSPGARSASSL...,4297.0,1.8,,,,,,,delta opioid receptor,US-8952032-B2,Morphine,Ki (nM),
29,BRUQQQPBMZOVGD-XFKAJCMBSA-N,MDSSAAPTNASNCTDALAYSSCSPAPSPGSWVNLSHLDGNLSDPCG...,133.48,12.0,,,,,,,mu opioid receptor,US-8952032-B2,Oxycodone,Ki (nM),
39,FJDDSMSDZHURBJ-UHFFFAOYSA-N,MQGNGSALPNASQPVLRGDGARPSWLASALACVLIFTIVVDILGNL...,0.028,0.01,,,,,,,,,,Ki (nM),
80,LOCQRDBFWSXQQI-UHFFFAOYSA-N,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,1.0,0.630957,,,,,,,,EP-1471912-A1,,Ki (nM),
93,NETZHAKZCGBWSS-CEDHKZHLSA-N,MDSPIQIFRGEPGPTCAPSACLPPNSSAWFPGWAEPDSNGSAGSED...,25.9,2.2,,,,,,,kappa opioid receptor,US-8952032-B2,nalbuphine,Ki (nM),
96,NETZHAKZCGBWSS-CEDHKZHLSA-N,MDSSAAPTNASNCTDALAYSSCSPAPSPGSWVNLSHLDGNLSDPCG...,114.3,0.89,,,,,,,mu opioid receptor,US-8952032-B2,nalbuphine,Ki (nM),


In [38]:
# Display the rows that have both our IC50 value and a reference IC50 value.
filtered_ic

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,protein_target_name,patent_number,molecule_name,Metric,Ligand SMILES
169,AATCBLYHOUOCTO-UHFFFAOYSA-N,MAGSGAGVRCSLLRLQETLSAADRCGAALAGHQLIRGLGQECVLSS...,,,10.0,5,,,,,,US-20100130597-A1,,IC50 (nM),
172,AILRADAXUVEEIR-UHFFFAOYSA-N,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,,,0.6,0.370000,,,,,,US-12257245-B2,,IC50 (nM),
180,BCEHBSKCWLPMDN-MGPLVRAMSA-N,MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIG...,,,16.0,14000,,,,,,EP-2723731-A1,,IC50 (nM),
182,BCSHRERPHLTPEE-NRFANRHFSA-N,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...,,,2.3,2.0,,,,,,US-12257245-B2,,IC50 (nM),
184,BCSHRERPHLTPEE-NRFANRHFSA-N,MGAIGLLWLLPLLLSTAAVGSGMGTGQRAGSPAAGPPLQPREPLSY...,,,120.0,175,,,,,,US-12257245-B2,,IC50 (nM),
192,CDMGBJANTYXAIV-UHFFFAOYSA-N,MSGPRAGFYRQELNKTVWEVPQRLQGLRPVGSGAYGSVCSAYDARL...,,,600.0,430,,,,,,,,IC50 (nM),
229,GNWHRHGTIBRNSM-UHFFFAOYSA-N,MPPGVDCPMEFWTKEENQSVVVDFLLPTGVYLNFPVSRNANLSTIK...,,,100.0,13,,,,,,US-12023342-B2,,IC50 (nM),
231,GUGOEEXESWIERI-UHFFFAOYSA-N,MPVRRGHVAPQNTFLDTIIRKFEGQSRKFIIANARVENCAVIYCND...,,,99.0,1200,,,,,,EP-3326616-A1,,IC50 (nM),
234,GXDALQBWZGODGZ-UHFFFAOYSA-N,MPVRRGHVAPQNTFLDTIIRKFEGQSRKFIIANARVENCAVIYCND...,,,50.0,0.9,,,,,,EP-3326616-A1,,IC50 (nM),
242,HAYYBYPASCDWEQ-UHFFFAOYSA-N,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,,,3.0,1.000000,,,,,,US-12257245-B2,,IC50 (nM),


In [34]:
# Display the rows that have both our EC50 value and a reference EC50 value.
filtered_ec

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,protein_target_name,patent_number,molecule_name,Metric,Ligand SMILES
584,NETZHAKZCGBWSS-CEDHKZHLSA-N,MDSPIQIFRGEPGPTCAPSACLPPNSSAWFPGWAEPDSNGSAGSED...,,,,,,,25.1,25.1,Kappa opioid receptor,US-8952032-B2,Nalbuphine,EC50 (nM),
599,RCEFMOGVOYEGJN-UHFFFAOYSA-N,MSFRAARLSMRNRRNDTLDSTRTLYSSASRSTDLSYSESDLVNFIQ...,,,,,,,200.0,300.0,TRPM8,EP-2029614-A2,Icilin,EC50 (nM),


In [35]:
# Display the rows that have both our Kd value and a reference Kd value.
filtered_kd

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,protein_target_name,patent_number,molecule_name,Metric,Ligand SMILES
457,ATSPSFUHQKFBTB-DOUURKCASA-N,MRLNSSAPGTPGTPAADPFQRAQAGLEEALLAPGFGNASGNASERV...,,,,,0.82,1.55,,,,US-20030166505-A1,,Kd (nM),
474,GUBGYTABKSRVRQ-DCSYEGIMSA-N,MAFSGSQAPYLSPAVPFSGTIQGGLQDGLQITVNGTVLSSSGTRFA...,,,,,190000.0,23000.0,,,,,,Kd (nM),
482,JGWRKYUXBBNENE-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,,,5.31,360.0,,,,WO-2019204604-A1,,Kd (nM),
518,SHHUPGSHGSNPDB-UHFFFAOYSA-N,MNEPTENRLGCSRTPEPDIRLRKGHQLDGTRRGDNDSHQGDLEPIL...,,,,,3100.0,3100.0,,,,US-20170298051-A1,,Kd (nM),


In [33]:
# Spot-check: look up the reference rows for one ligand / target-sequence pair.
# NOTE(review): this re-reads the CSV and rebinds `test_df`, so from here on the
# frame no longer reflects the drop_duplicates() pass applied in an earlier cell.
test_df = pd.read_csv('../../BindingDB_singlechain_deduplicated.csv')

query_inchi_key = "KTUFNOKKBVMGRW-UHFFFAOYSA-N"
query_sequence = "MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNGSVEWDGPPSPHWTLYSDGSSSILSTNNATFQNTGTYRCTEPGDPLGGSAAIHLYVKDPARPWNVLAQEVVVFEDQDALLPCLLTDPVLEAGVSLVRVRGRPLMRHTNYSFSPWHGFTIHRAKFIQSQDYQCSALMGGRKVMSISIRLKVQKVIPGPPALTLVPAELVRIRGEAAQIVCSASSVDVNFDVFLQHNNTKLAIPQQSDFHNNRYQKVLTLNLDQVDFQHAGNYSCVASNVQGKHSTSMFFRVVESAYLNLSSEQNLIQEVTVGEGLNLKVMVEAYPGLQGFNWTYLGPFSDHQPEPKLANATTKDTYRHTFTLSLPRLKPSEAGRYSFLARNPGGWRALTFELTLRYPPEVSVIWTFINGSGTLLCAASGYPQPNVTWLQCSGHTDRCDEAQVLQVWDDPYPEVLSQEPFHKVTVQSLLTVETLEHNQTYECRAHNSVGSGSWAFIPISAGAHTHPPDEFLFTPVVVACMSIMALLLLLLLLLLYKYKQKPKYQVRWKIIESYEGNSYTFIDPTQLPYNEKWEFPRNNLQFGKTLGAGAFGKVVEATAFGLGKEDAVLKVAVKMLKSTAHADEKEALMSELKIMSHLGQHENIVNLLGACTHGGPVLVITEYCCYGDLLNFLRRKAEAMLGPSLSPGQDPEGGVDYKNIHLEKKYVRRDSGFSSQGVDTYVEMRPVSTSSNDSFSEQDLDKEDGRPLELRDLLHFSSQVAQGMAFLASKNCIHRDVAARNVLLTNGHVAKIGDFGLARDIMNDSNYIVKGNARLPVKWMAPESIFDCVYTVQSDVWSYGILLWEIFSLGLNPYPGILVNSKFYKLVKDGYQMAQPAFAPKNIYSIMQACWALEPTHRPTFQQICSFLQEQAQEDRRERDYTNLPSSSRSGGSGSSSSELEEESSSEHLTCCEQGDIAQPLLQPNNYQFC"

# Exact match on the ligand key; the sequence only has to contain the query text,
# and rows with a missing sequence count as non-matching (na=False).
key_matches = test_df["Ligand InChI Key"] == query_inchi_key
sequence_matches = test_df["Sequence"].str.contains(query_sequence, na=False)

result = test_df[key_matches & sequence_matches]
result


Unnamed: 0,Ligand SMILES,Ligand InChI,Ligand InChI Key,Target Name,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),Curation/DataSource,Sequence
1743782,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,InChI=1S/C29H31N7O/c1-21-5-10-25(18-27(21)34-2...,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Macrophage colony-stimulating factor 1 receptor,,1274.0,,,ChEMBL,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...
1893337,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,InChI=1S/C29H31N7O/c1-21-5-10-25(18-27(21)34-2...,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Macrophage colony-stimulating factor 1 receptor,,,10.0,,ChEMBL,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...
