In [96]:
import pandas as pd
import os
import torch
import json
import numpy as np

In [97]:
%run Config.ipynb

In [98]:
# Instantiate the project configuration. `Config` is presumably provided by the
# `%run Config.ipynb` cell above (verify); `config.data` is used below as the
# directory that holds the input and output files.
config = Config()

#### RMSDs

In [99]:
# Load the RMSD table (its pdb / poserank / RMSD columns are used below)
# from the configured data directory.
rmsd_csv_path = f"{config.data}/rmsd.csv"
rmsd = pd.read_csv(rmsd_csv_path)

In [100]:
# Number of distinct PDB ids present in the RMSD table.
rmsd['pdb'].nunique()

39

Note: one PDB is missing from `rmsd.csv` (39 unique PDB ids here, versus 40 once the similarity scores are merged in below).

In [101]:
# Pose-rank counts for PDB id "4kz6"; an empty result means that id has no
# rows in the RMSD table.
is_4kz6 = rmsd['pdb'] == "4kz6"
rmsd.loc[is_4kz6, 'poserank'].value_counts()

Series([], Name: poserank, dtype: int64)

Complexes are labeled as correct if they fall within 2.0 Angstroms heavy-atom RMSD of the reference ligand structure, a standard cutoff in the field (see e.g. Ref. 44). Correct poses are labeled “positive,” and those that fall outside the cutoff are labeled as “negative.”

#### Creating label 

In [102]:
# Start the label table from an independent (deep) copy of the RMSD table,
# so later column additions do not touch `rmsd`.
label = rmsd.copy(deep=True)

#### Score similarity 

In [103]:
# Load the similarity scores and flatten them to long format: after
# `unstack()` the column labels become "pdb", the row labels become
# "poserank", and the cell values become "similarity".
score_wide = pd.read_json(f"{config.data}/score.json")
score = (
    score_wide
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "pdb", "level_1": "poserank", 0: 'similarity'})
)

#### Merge

In [104]:
# Outer-join the similarity scores with the RMSD table on (pdb, poserank) so
# that rows present in only one source are kept, then drop exact duplicate rows.
# The right-hand side is the untouched `rmsd` frame (which `label` was copied
# from) rather than `label` itself: this keeps the cell idempotent, whereas
# merging `label` into itself would, on a re-run, join against the already
# merged frame and produce suffixed `similarity_x` / `similarity_y` columns.
merge_keys = ['pdb', 'poserank']
label = pd.merge(
    score,
    rmsd,
    left_on=merge_keys,
    right_on=merge_keys,
    how='outer',
).drop_duplicates()

In [105]:
#label.sort_values('similarity', ascending = False).tail(30)

| rmsd | score |
|------|-------|
| rmsd < 2 | 10 |
| 2 <= rmsd < 3 | 7 |
| rmsd >= 3 | 0 |

| score | match |
|-------|-------|
| 79 | -10 |
| 70 | -7 |
| 66 | 0 |
| 42 | 0 |


In [106]:
# Boolean masks for the three RMSD bands, in priority order:
# RMSD < 2, 2 <= RMSD < 3, RMSD >= 3.
# Rows with a missing RMSD compare False everywhere and match no band.
rmsd_values = label['RMSD']
conditions_rmsd = [
    rmsd_values.lt(2),
    rmsd_values.ge(2) & rmsd_values.lt(3),
    rmsd_values.ge(3),
]

In [107]:
# Boolean masks for the three similarity bands, in priority order:
# similarity >= 80, 70 <= similarity < 80, similarity < 70.
# Rows with a missing similarity compare False everywhere and match no band.
similarity_values = label['similarity']
conditions_similarity = [
    similarity_values.ge(80),
    similarity_values.ge(70) & similarity_values.lt(80),
    similarity_values.lt(70),
]

In [108]:
# Points awarded per band, aligned position-by-position with the condition lists.
scores_rmsd = [10, 7, 0]
scores_similarity = [10, 7, 0]

# Give each row the score of the first band it matches; rows that match no
# band (e.g. a missing value) receive NaN.
for weight_col, band_masks, band_scores in (
    ('rmsd_weight', conditions_rmsd, scores_rmsd),
    ('similarity_weight', conditions_similarity, scores_similarity),
):
    label[weight_col] = np.select(band_masks, band_scores, default=np.nan)

In [109]:
# Number of distinct PDB ids in the merged label table.
label["pdb"].nunique()

40

In [110]:
#label['RMSD'].hist()

$$
\text{weight} = \frac{\text{score\_rmsd} \times 0.5 + \text{score\_match} \times 0.5}{10}
$$


In [114]:
# Equal-weight (0.5 / 0.5) blend of the RMSD and similarity band scores,
# divided by 10.
label['weight'] = (
    label['rmsd_weight'].mul(0.5)
    .add(label['similarity_weight'].mul(0.5))
    .div(10)
)

In [115]:
# Binary label: 1 when the blended weight is strictly above 0.5, otherwise 0.
# A NaN weight compares False and therefore maps to 0.
label["label"] = label["weight"].gt(0.5).astype("int64")

In [116]:
# Spot check: show every row for PDB id "1ydr" with its computed weights and label.
label[label['pdb'] == "1ydr"]

Unnamed: 0,pdb,poserank,similarity,RMSD,rmsd_weight,similarity_weight,weight,label
110,1ydr,1,82.089552,0.58,10.0,10.0,1.0,1
121,1ydr,2,5.970149,3.5,0.0,0.0,0.0,0
132,1ydr,3,5.970149,3.75,0.0,0.0,0.0,0
143,1ydr,4,13.432836,2.7,7.0,0.0,0.35,0
154,1ydr,5,7.462687,3.49,0.0,0.0,0.0,0
165,1ydr,6,13.432836,3.53,0.0,0.0,0.0,0
176,1ydr,7,47.761194,2.55,7.0,0.0,0.35,0
187,1ydr,8,5.970149,3.79,0.0,0.0,0.0,0
198,1ydr,9,0.0,10.15,0.0,0.0,0.0,0
209,1ydr,10,0.0,1.82,10.0,0.0,0.5,0


In [117]:
# Preview the first rows of the label table.
label.head()

Unnamed: 0,pdb,poserank,similarity,RMSD,rmsd_weight,similarity_weight,weight,label
0,2qbq,1,67.153285,0.87,10.0,0.0,0.5,0
11,2qbq,2,84.671533,0.4,10.0,10.0,1.0,1
22,2qbq,3,45.985401,7.37,0.0,0.0,0.0,0
33,2qbq,4,70.072993,5.79,0.0,7.0,0.35,0
44,2qbq,5,54.014599,2.32,7.0,0.0,0.35,0


In [118]:
# Class balance of the binary label (number of 0 rows vs. 1 rows).
label['label'].value_counts()

0    372
1     28
Name: label, dtype: int64

    0    8570
    1    4575
    Name: label, dtype: int64

In [119]:
# Total number of labelled rows.
len(label.index)

400

In [120]:
# Number of distinct PDB ids covered by the final label table.
label['pdb'].nunique()

40

#### Creating dict 

In [121]:
# Build the nested mapping {pdb: {poserank: label}} that is written to label.json.
# The per-row value is held in `pose_label` rather than `label`, so the `label`
# DataFrame is not overwritten by a scalar and this cell (and any other cell
# that uses `label`) can safely be re-run.
result_dict = {}
for pdb, poserank, pose_label in zip(label['pdb'], label['poserank'], label['label']):
    # Create the inner per-PDB dict on first sight, then record this pose.
    result_dict.setdefault(pdb, {})[poserank] = pose_label

In [122]:
# Report the directory label.json is written to. Note that this message is
# printed before the file is actually saved by the following cell.
print("label.json saved at:", config.data)

label.json saved at: ../Datahub/Data


In [123]:
# Serialise the nested {pdb: {poserank: label}} mapping to label.json in the
# configured data directory.
label_json_path = f"{config.data}/label.json"
with open(label_json_path, "w") as out_file:
    json.dump(result_dict, out_file)

In [124]:
# Read the saved label.json back as raw text (a quick check that the file was
# written). `result` is reassigned by the next cell, which parses the same file
# into a DataFrame.
with open(f"{config.data}/label.json", "r") as json_file:
    result = json_file.read()

In [125]:
# Reload label.json as a DataFrame to inspect what was saved.
saved_label_path = f"{config.data}/label.json"
result = pd.read_json(saved_label_path)

In [126]:
# Display the reloaded labels (one column per PDB id, one row per pose rank).
result

Unnamed: 0,2qbq,1ydr,3b68,1owh,2vw5,2fxs,2cet,1ydt,2fvd,3ao4,...,1bzc,1bcu,1s38,2yfe,2wvt,2w66,1h23,2xb8,1z6e,3bgz
1,0,1,0,0,1,0,0,0,0,0,...,1,1,1,0,1,1,0,1,1,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Report 

| rmsd | score |
|------|-------|
| rmsd < 2 | 10 |
| 2 <= rmsd < 3 | 7 |
| rmsd >= 3 | 0 |

| match | score |
|-------|-------|
| match >= 80 | 10 |
| 70 <= match < 80 | 7 |
| match < 70 | 0 |



| Case | PDB   | pose | similarity | rmsd | w_rmsd | w_similarity |
|------|-------|------|------------|------|--------|--------------|
| 1    | 2qbq  | 1    | 59.854015  | 1.15 | 10.0   | 0.0          |
| 2    | 2qbp  | 10   | 0.0        | 1.73 | -      | -            |
| 3    | 1ydr  | 10   | 0.0        | 2.17 | -      | -            |
| 4    | 1bzc  | 10   | 0.0        | 1.64 | -      | -            |
