In [1]:
import pandas as pd

# Read the two tables to be compared: our own table and the table that the
# later cells treat as the reference
our_df = pd.read_csv(filepath_or_buffer='bindb.csv')
test_df = pd.read_csv(filepath_or_buffer='../../BindingDB_singlechain.csv')

In [2]:
# Discard rows that are exact repeats (every column equal) within each table
our_df, test_df = (table.drop_duplicates() for table in (our_df, test_df))

In [3]:
# Key columns that identify a ligand/target pair in both tables
compare_cols = ['Ligand InChI Key', 'Sequence']
# Affinity measures pulled from the reference table for side-by-side comparison
measure_cols = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)']

# Prepare the reference slice before joining:
#  - drop rows with a missing key: pandas merge matches null keys to each
#    other, which would pair unrelated rows
#  - drop repeats: test_df was de-duplicated on all of its columns, so this
#    narrower slice can still contain identical rows that would multiply the
#    rows of the left join
#  - rename explicitly: merge suffixes are only applied to overlapping column
#    names, so a measure missing from our_df would otherwise arrive without
#    the '_ref' suffix
reference = (
    test_df[compare_cols + measure_cols]
    .dropna(subset=compare_cols)
    .drop_duplicates()
    .rename(columns={col: f'{col}_ref' for col in measure_cols})
)

# Left join: every our_df row is kept; reference values land in '*_ref' columns
# (NaN where the reference has no matching ligand/sequence pair)
merged = pd.merge(our_df, reference, on=compare_cols, how='left')

In [8]:
# For each measure, keep only the merged rows where our value and the
# reference value are both present (dropna removes a row if either is null)
filtered_ki = merged.dropna(subset=['Ki (nM)', 'Ki (nM)_ref'])
filtered_ic = merged.dropna(subset=['IC50 (nM)', 'IC50 (nM)_ref'])
filtered_ec = merged.dropna(subset=['EC50 (nM)', 'EC50 (nM)_ref'])
filtered_kd = merged.dropna(subset=['Kd (nM)', 'Kd (nM)_ref'])


In [9]:
# Column layout: the two key columns, then each measure immediately followed
# by its reference counterpart, then everything else in merged's own order
key_cols = ['Ligand InChI Key', 'Sequence']
measure_pairs = [
    ('Ki (nM)', 'Ki (nM)_ref'),
    ('IC50 (nM)', 'IC50 (nM)_ref'),
    ('Kd (nM)', 'Kd (nM)_ref'),
    ('EC50 (nM)', 'EC50 (nM)_ref'),
]
ordered_cols = key_cols + [name for pair in measure_pairs for name in pair]

# Columns not named above are appended after the main block
remaining_cols = [name for name in merged.columns if name not in ordered_cols]
final_cols = [*ordered_cols, *remaining_cols]

# Apply the same layout to all four filtered tables
filtered_ki, filtered_ic, filtered_ec, filtered_kd = (
    table[final_cols]
    for table in (filtered_ki, filtered_ic, filtered_ec, filtered_kd)
)


In [10]:
# Preview the first rows that have a Ki value in both our data and the reference
filtered_ki.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name
86,XXCCRHIAIBQDPX-PEWBXTNBSA-N,MEEGGDFDNYYGADNQSECEYTDWKSSGALIPAIYMLVFLLGTTGN...,20.0,0.7,,,,,,,,nM,WO-2023108291-A1,Apelin 13,APJ receptor
787,LOCQRDBFWSXQQI-UHFFFAOYSA-N,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,1.258925,0.66,,,,,,,,nM,EP-1471912-A1,SB-271046,5-HT6 receptor
788,LOCQRDBFWSXQQI-UHFFFAOYSA-N,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,1.258925,0.81,,,,,,,,nM,EP-1471912-A1,SB-271046,5-HT6 receptor
789,LOCQRDBFWSXQQI-UHFFFAOYSA-N,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,1.258925,1.0,,,,,,,,nM,EP-1471912-A1,SB-271046,5-HT6 receptor
790,LOCQRDBFWSXQQI-UHFFFAOYSA-N,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,1.258925,3.0,,,,,,,,nM,EP-1471912-A1,SB-271046,5-HT6 receptor


In [11]:
# Preview the first rows that have an IC50 value in both our data and the reference
filtered_ic.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name
32,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,2.8,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
33,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,8.0,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
58,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,291.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
60,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,960.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
62,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,1274.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms


In [11]:
# NOTE(review): this cell repeats the earlier IC50 preview (same statement,
# same execution count) — it can probably be deleted; confirm before removing
filtered_ic.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name
32,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,2.8,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
33,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,8.0,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
58,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,291.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
60,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,960.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
62,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,1274.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms


In [11]:
# NOTE(review): another repeat of the earlier IC50 preview (same statement,
# same execution count) — it can probably be deleted; confirm before removing
filtered_ic.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name
32,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,2.8,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
33,ZIQFYVPVJZEOFS-UHFFFAOYSA-N,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,,8.0,8.0,,,,,,nM,WO-2002102976-A2,PD 166326,c-abl
58,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,291.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
60,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,960.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms
62,KTUFNOKKBVMGRW-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,1400.0,1274.0,,,,,,nM,EP-2736489-A1,Imatinib,c-fms


In [12]:
# Preview the first rows that have an EC50 value in both our data and the
# reference (the saved output below shows no such rows)
filtered_ec.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name


In [13]:
# Preview the first rows that have a Kd value in both our data and the reference
filtered_kd.head()

Unnamed: 0,Ligand InChI Key,Sequence,Ki (nM),Ki (nM)_ref,IC50 (nM),IC50 (nM)_ref,Kd (nM),Kd (nM)_ref,EC50 (nM),EC50 (nM)_ref,Ligand SMILES,unit,patent_number,molecule_name,protein_target_name
11,JGWRKYUXBBNENE-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,,,5.31,360.0,,,,nM,WO-2019204604-A1,PLX3397,CSF1 receptor
12,JGWRKYUXBBNENE-UHFFFAOYSA-N,MGPGVLLLLLVATAWHGQGIPVIEPSVPELVVKPGATVTLRCVGNG...,,,,,5.31,5.8,,,,nM,WO-2019204604-A1,PLX3397,CSF1 receptor
951,SHHUPGSHGSNPDB-UHFFFAOYSA-N,MNEPTENRLGCSRTPEPDIRLRKGHQLDGTRRGDNDSHQGDLEPIL...,,,,,3100.0,3100.0,,,,nM,US-20170298051-A1,AdipoRon,AdipoR2
1112,ATSPSFUHQKFBTB-DOUURKCASA-N,MRLNSSAPGTPGTPAADPFQRAQAGLEEALLAPGFGNASGNASERV...,,,,,0.82,1.55,,,,nM,US-20030166505-A1,NT69L,NTR
1165,ATSPSFUHQKFBTB-DOUURKCASA-N,MRLNSSAPGTPGTPAADPFQRAQAGLEEALLAPGFGNASGNASERV...,,,,,1.55,1.55,,,,nM,US-20030166505-A1,NT69L,NTR


In [None]:
# Optional: reorder columns for clarity — key columns first, then the other
# our_df columns in their original order, then the reference values
ref_cols = ['Ki (nM)_ref', 'IC50 (nM)_ref', 'Kd (nM)_ref', 'EC50 (nM)_ref']
cols = (
    compare_cols +
    [col for col in our_df.columns if col not in compare_cols] +
    ref_cols
)
merged = merged[cols]

# Save the comparison table; the row index is not written to the file
merged.to_csv('comparison.csv', index=False)

# Keep the preview as the cell's last expression so the notebook renders it
# as a table rather than printing plain text
merged.head()
