In [1]:
import pandas as pd 
import numpy as np
import os
from math import ceil

In [2]:
import sys
sys.path.append('../evaluation/')

In [3]:
from utils import build_pdb_dict

In [23]:
bridge_type = 'vp'

log_path = '../../docking_res/logs/' + bridge_type
raw_data_path = '../../data/cleaned_crossdocked_data/raw'

log_files = os.listdir(log_path)
pdb_dict, pdb_rev_dict = build_pdb_dict(raw_data_path)

In [24]:
log_files

['1gs4_A_rec_5t8e_77u_lig_tt_min_0.log',
 '4f8b_B_rec_4fgc_pq0_lig_it1_it2_tt_docked_1.log',
 '3b67_A_rec_3b66_b66_lig_tt_docked_3.log',
 '3omm_A_rec_5q1d_9nd_lig_tt_min_0.log',
 '5j3l_A_rec_5ioz_6c4_lig_tt_docked_5.log',
 '4u6e_A_rec_4u6e_q02_lig_tt_docked_1.log',
 '3uzz_B_rec_3uzx_aox_lig_tt_min_0.log',
 '4l7r_A_rec_4l6z_1dc_lig_tt_min_0.log',
 '2b3l_A_rec_4u69_q07_lig_tt_min_0.log',
 '4nyx_A_rec_5mqk_qpr_lig_tt_min_0.log',
 '5ewh_A_rec_5eps_5qx_lig_tt_docked_12.log',
 '2qvd_A_rec_2qu9_eug_lig_tt_min_0.log',
 '3q3s_A_rec_5j1u_p93_lig_tt_min_0.log',
 '4za0_A_rec_3ujs_xsp_lig_tt_docked_13.log',
 '4og8_A_rec_4og8_tbf_lig_tt_min_0.log',
 '3tp0_A_rec_5f1j_5to_lig_tt_docked_4.log',
 '2nw4_A_rec_5t8e_77u_lig_tt_min_0.log',
 '5ffy_A_rec_5c87_4ys_lig_tt_docked_0.log',
 '4kz3_A_rec_4kz3_1u1_lig_tt_min_0.log',
 '4fli_A_rec_4u6e_q02_lig_tt_min_0.log',
 '3u2o_A_rec_3f1q_bce_lig_tt_docked_11.log',
 '5i86_B_rec_5tb6_77x_lig_tt_min_0.log',
 '4l6z_A_rec_3fhb_gab_lig_tt_min_0.log',
 '5mqk_B_rec_3p1c_a

In [25]:
def get_minimized_affinity(filename, mode='ref'):
    if mode == 'ref':
        with open(filename, 'r') as file:
            for line in file:
                if '<minimizedAffinity>' in line:
                    # The next line after '<minimizedAffinity>' contains the value
                    return float(next(file))
            
    else:
        with open(filename, 'r') as f:
            lines = f.readlines()
            lines = lines[19:28]
#             print(len(lines))
            n = len(lines)
            arr = np.zeros(n)
#             print(filename)
            for i in range(n):
                arr[i] = float(lines[i][11:17])
#             print(arr)
            try:
                score = np.min(arr)
            except:
                score = None
        return score

In [26]:
score_dict = {}
for log_file in log_files:
    ligand_name = log_file.split('.')[0]
    ligand_file = ligand_name + '.sdf'
    pdb_folder = pdb_rev_dict[ligand_file]
    ref_file = os.path.join(raw_data_path, pdb_folder, ligand_file)
    ref_score = get_minimized_affinity(ref_file, 'ref')
#     print(log_file)
    gen_score = get_minimized_affinity(os.path.join(log_path, log_file), 'gen')
    score_dict[ligand_name] = [gen_score, ref_score]

In [27]:
df = pd.DataFrame.from_dict(score_dict, orient='index', columns=['Binding affinity', 'Reference'])
df

Unnamed: 0,Binding affinity,Reference
1gs4_A_rec_5t8e_77u_lig_tt_min_0,-6.62,-8.10158
4f8b_B_rec_4fgc_pq0_lig_it1_it2_tt_docked_1,-5.36,-1.65858
3b67_A_rec_3b66_b66_lig_tt_docked_3,-11.24,-9.64387
3omm_A_rec_5q1d_9nd_lig_tt_min_0,-12.72,-12.99205
5j3l_A_rec_5ioz_6c4_lig_tt_docked_5,-7.24,-8.50609
...,...,...
2nn1_A_rec_2nmx_m25_lig_tt_docked_2,-7.01,-6.21650
3p6f_A_rec_3p6h_ibp_lig_tt_docked_3,-5.17,-6.39480
5wbe_A_rec_1q4g_bfl_lig_tt_docked_3,-6.44,-8.35387
3hw5_A_rec_5d9j_0n8_lig_tt_min_0,-8.93,-6.05772


In [28]:
sum(df['Binding affinity'] < df['Reference']) / len(df)

0.5696055684454756

In [29]:
pdb_rev_dict['4ls1_A_rec_5h73_7l7_lig_tt_min_0.sdf']

'PYRD_HUMAN_65_395_0'

In [None]:
def evaluate_bdb(model, meta_data):
    score_dict = {}
    for index, row in meta_data.iterrows():
        pdb_id = row['pdb']
#         print(pdb_id)
        score_dict[pdb_id] = []
        log_path = os.path.join('logs/' + model, pdb_id)
        for filename in os.listdir(log_path):
            log_file = os.path.join(log_path, filename)
            with open(log_file, 'r') as f:
                lines = f.readlines()
#             print(len(lines))
            lines = lines[16:]
            n = ceil(len(lines)/14)
            arr = np.zeros(n)
#             print(filename)
            for i in range(n):
                arr[i] = float(lines[i*14+3][11:17])
            score = np.min(arr)
            smi = filename.split('.')[0]
            score_dict[pdb_id].append(score)
            
    score_dict = {k:np.array(v) for k, v in score_dict.items()}
    return score_dict