In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import rdkit.Chem.AllChem as Chem
from itertools import permutations
import math
%matplotlib inline

In [2]:
# Load the worksheet at position 1 (sheet_name is a zero-based position) of the
# supplementary spreadsheet and drop rows in which every cell is empty.
# `axis` is passed by keyword: newer pandas releases reject positional arguments
# to dropna(). The original row labels are kept (no reset_index) because later
# cells locate each system's block of rows by these labels.
DG_data = (
    pd.read_excel('ja512751q_si_003.xlsx', sheet_name=1, engine='openpyxl')
    .dropna(axis=0, how='all')
)

  warn(msg)


In [3]:
# Build `system_names`: a list of (system name, label of the last row that
# belongs to that system), in row order. A row with a non-null 'Systems' cell
# starts a new system; the previous system is taken to end on the label just
# before it.
system_names = []
# Reset the trackers explicitly so the cell neither depends on values left over
# from an earlier execution nor fails when the first system is not at label 0.
prev_sys = None
idx = None
for idx, row in DG_data.iterrows():
    if pd.isna(row['Systems']):
        continue
    if prev_sys is not None:
        # Report where each subsequent system starts and close the previous one.
        print(f"{row['Systems']},{idx}")
        system_names.append((prev_sys, idx-1))
    prev_sys = row['Systems']
# Close the final system on the last row of the table (skipped if no system
# name was found at all, e.g. for an empty table).
if prev_sys is not None:
    system_names.append((prev_sys, idx))

CDK2,37
JNK1,54
MCL1,76
P38,119
Thrombin,154
Tyk2,166
PTP1B,183


In [4]:
# Write the system name into every row of each system's block, so that all
# rows (not only the first row of a block) carry a 'Systems' value.
# `system_names` supplies (name, last row label) pairs in row order.
prev_last = 0
for name, last_row in system_names:
    # The spreadsheet spells this system 'Tyk2'; use the upper-case form.
    if name == 'Tyk2':
        name = 'TYK2'
    # .loc is the label-based accessor that accepts slices (.at is meant for
    # single-cell access). Label slices include BOTH endpoints.
    DG_data.loc[prev_last:last_row, 'Systems'] = name
    # Because the slice above is inclusive, the next block must start one label
    # later; otherwise it would overwrite this block's last row.
    prev_last = last_row + 1

In [5]:
# Build `ligand_map`: "<system> <ligand name>" -> path of the ligand SDF file.
# Files are searched in a directory named after the system, and the ligand name
# is read from each molecule's '_Name' property.
ligand_map = dict()
for system, _ in system_names:
    # The spreadsheet spells this system 'Tyk2'; use the upper-case form.
    if system == 'Tyk2':
        system = 'TYK2'
    # glob() returns paths in arbitrary order; sorting makes the result
    # reproducible if two files ever carry the same ligand name.
    for file in sorted(glob(f'{system}/*_LIG*.sdf')):
        mol = Chem.MolFromMolFile(file)
        if mol is None:
            # RDKit signals a parse failure by returning None; fail with the
            # offending path instead of an AttributeError on the next line.
            raise ValueError(f"RDKit could not parse ligand file: {file}")
        lig_name = mol.GetProp('_Name')
        ligand_map[f"{system} {lig_name}"] = file

In [6]:
# Build the "<system> <ligand>" lookup key for every row, then translate each
# key into a ligand file path through `ligand_map`. Keys that are not present
# in the map end up as missing values in the 'lig' column.
lookup_keys = pd.Series(
    [f"{system} {ligand}" for system, ligand in zip(DG_data['Systems'], DG_data['Ligand'])],
    index=DG_data.index,
)
DG_data['lig'] = lookup_keys.map(ligand_map)

We need to convert $\Delta G_{Exp}$ to pIC$_{50}$. The two are related by $\Delta G_{Exp}=RT\ln(\mathrm{IC}_{50})$, with $\mathrm{IC}_{50}$ expressed in mol/L.

Therefore:

$\mathrm{pIC}_{50} = -\log_{10}(\mathrm{IC}_{50}) = -\log_{10}(e^{\Delta G_{Exp}/(RT)}) = -\dfrac{\Delta G_{Exp}}{RT\ln 10}$

In [7]:
# Constants for the dG -> pIC50 conversion, as given in the paper.
# NOTE(review): the value of R matches the molar gas constant in kcal/(mol*K),
# so T is presumably in kelvin and 'Exp. dG' in kcal/mol -- confirm against the paper.
R = 1.98720425864083e-3
T = 297

In [8]:
# Convert the experimental free energies to pIC50.
# -log10(exp(dG / (R*T))) is algebraically -dG / (R*T*ln(10)). The closed form
# is applied to the whole column at once and never evaluates exp(), so it
# cannot overflow/underflow to +/-inf and avoids the exp -> log10 round trip.
# Missing dG values stay NaN.
DG_data['pIC50'] = -DG_data['Exp. dG'].astype(float) / (R * T * np.log(10))

In [10]:
# Build one record per ORDERED pair of distinct ligands within each system:
#   class   - 1 if ligand 1 has the higher pIC50, else 0
#   reg     - pIC50(ligand 1) - pIC50(ligand 2)
#   dg_lig1 - pIC50 of ligand 1 (column name kept as-is)
#   dg_lig2 - pIC50 of ligand 2 (column name kept as-is)
#   rec     - receptor path "<system>/<system>_PROT.pdb"
#   lig1/2  - ligand file paths taken from the 'lig' column
# NOTE(review): a ligand with no entry in the 'lig' column is written as the
# text "nan"; check the 'lig' column for missing values before using the output.
dg_grouped = DG_data.groupby('Systems')
full_types_list = []
for rec, group in dg_grouped:
    # The receptor path depends only on the system, so build it once per group.
    receptor = f"{rec}/{rec}_PROT.pdb"
    for idx1, idx2 in permutations(group.index, 2):
        dg_lig1 = float(group.loc[idx1, 'pIC50'])
        dg_lig2 = float(group.loc[idx2, 'pIC50'])
        regression = dg_lig1 - dg_lig2
        # Skip pairs whose difference is NaN or infinite (e.g. a ligand
        # without an experimental value).
        if not math.isfinite(regression):
            continue
        classification = int(dg_lig1 > dg_lig2)
        lig1 = f"{group.loc[idx1,'lig']}"
        lig2 = f"{group.loc[idx2,'lig']}"
        full_types_list.append([classification, regression, dg_lig1, dg_lig2, receptor, lig1, lig2])
full_types_df_best = pd.DataFrame(
    full_types_list,
    columns=['class', 'reg', 'dg_lig1', 'dg_lig2', 'rec', 'lig1', 'lig2'],
)
    

In [11]:
# Write one space-separated file per receptor, "<system>/<system>_DDG.types",
# without header or index and with floats formatted to four decimals.
# sort=False keeps the receptors in their order of first appearance.
for rec, rec_subdf in full_types_df_best.groupby('rec', sort=False):
    # The receptor path is "<system>/<file>", so the text before the first '/'
    # is the system directory.
    rec_name = rec.partition('/')[0]
    rec_subdf.to_csv(
        f"{rec_name}/{rec_name}_DDG.types",
        sep=' ',
        header=False,
        index=False,
        float_format='%.4f',
    )