In [1]:
import pandas as pd
import os
import torch
import json
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%run Config.ipynb

In [3]:
# Instantiate the project configuration object. `Config` is not defined in this
# notebook; presumably it is provided by the `%run Config.ipynb` cell - verify.
config = Config()

#### Rmsd's 

In [4]:
# Load the per-pose RMSD table from the configured data directory.
# Later cells read its 'pdb', 'poserank' and 'RMSD' columns.
rmsd = pd.read_csv(f"{config.data}/rmsd.csv")

In [5]:
# Number of distinct PDB ids present in the RMSD table.
rmsd['pdb'].nunique()

40

Note: one PDB entry is missing from `rmsd.csv` (the check below shows that `4kz6` has no rows).

In [135]:
# Pose-rank counts for PDB id 4kz6; an empty Series means the id has no rows in the RMSD table.
rmsd.loc[rmsd['pdb'] == "4kz6", 'poserank'].value_counts()

Series([], Name: poserank, dtype: int64)

Complexes are labeled as correct if they fall within 2.0 Angstroms heavy-atom RMSD of the reference ligand structure, a standard cutoff in the field (see e.g. Ref. 44). Correct poses are labeled “positive,” and those that fall outside the cutoff are labeled as “negative.”

#### Creating label 

In [136]:
# Start the label table from a copy of the RMSD table so that `rmsd` itself
# is not modified by the columns added later.
label = rmsd.copy()

#### Score similarity 

In [137]:
# Load the similarity scores and reshape them from wide to long format:
# one row per (pdb, poserank) pair with its similarity value.
score_wide = pd.read_json(f"{config.data}/score.json")
score_long = score_wide.unstack().reset_index()
score = score_long.rename(
    columns={"level_0": "pdb", "level_1": "poserank", 0: "similarity"}
)

#### Merge

In [138]:
# Combine the similarity scores with the RMSD table on (pdb, poserank).
# The right-hand side is `rmsd` rather than `label`, so this cell no longer
# consumes its own output: re-running it rebuilds `label` from scratch instead
# of merging an already-merged frame. On a top-to-bottom run the result is
# unchanged, because `label` is an unmodified copy of `rmsd` at this point.
# how='outer' keeps poses that appear in only one of the two sources; the
# value coming from the other source is NaN for those rows.
label = pd.merge(score, rmsd, on=['pdb', 'poserank'], how='outer')

In [139]:
#label.sort_values('similarity', ascending = False).tail(30)

| RMSD (Å) | score |
|----------|-------|
| RMSD < 2 | 10 |
| 2 ≤ RMSD < 3 | 7 |
| RMSD ≥ 3 | 0 |

| similarity | score |
|------------|-------|
| similarity ≥ 79 | 10 |
| 70 ≤ similarity < 79 | 7 |
| 66 ≤ similarity < 70 | 0 |
| similarity < 66 (e.g. 42) | 0 |


In [140]:
# RMSD bands, listed in the same order as the RMSD score list:
# RMSD < 2, 2 <= RMSD < 3, RMSD >= 3.
# A row with a missing RMSD satisfies none of them.
rmsd_values = label['RMSD']
conditions_rmsd = [
    rmsd_values.lt(2),
    rmsd_values.ge(2) & rmsd_values.lt(3),
    rmsd_values.ge(3),
]

In [141]:
# Similarity bands, listed in the same order as the similarity score list:
# >= 79, [70, 79), [66, 70), < 66.
# A row with a missing similarity satisfies none of them.
similarity_values = label['similarity']
conditions_similarity = [
    similarity_values.ge(79),
    similarity_values.ge(70) & similarity_values.lt(79),
    similarity_values.ge(66) & similarity_values.lt(70),
    similarity_values.lt(66),
]

In [142]:
# Points awarded per band; each list is positionally aligned with its
# condition list (first matching condition wins).
scores_rmsd = [10, 7, 0]
scores_similarity = [10, 7, 0, 0]

# Rows that match no band (e.g. a missing RMSD or similarity) get NaN.
label['rmsd_weight'] = np.select(
    condlist=conditions_rmsd, choicelist=scores_rmsd, default=np.nan
)
label['similarity_weight'] = np.select(
    condlist=conditions_similarity, choicelist=scores_similarity, default=np.nan
)

In [143]:
# Sanity check: number of distinct PDB ids in the merged label table.
label["pdb"].nunique()

40

In [1]:
#label['RMSD'].hist()

`weight = (rmsd_weight * 1 + similarity_weight * 1) / 20`

In [146]:
# Combined weight: the two band scores (each at most 10) summed and scaled by 20,
# so the maximum is 1.0. NaN in either component propagates to the result.
label['weight'] = label['rmsd_weight'].add(label['similarity_weight']).div(20)

In [147]:
# Binary label: 1 when the combined weight is strictly above 0.5, otherwise 0.
# A NaN weight compares as False and therefore maps to 0.
label["label"] = label["weight"].gt(0.5).astype("int64")

In [148]:
# Preview the first rows of the label table with the weight and label columns.
label.head()

Unnamed: 0,pdb,poserank,similarity,RMSD,rmsd_weight,similarity_weight,weight,label
0,2qbq,1,59.854015,1.15,10.0,0.0,0.5,0
1,2qbq,2,73.722628,0.45,10.0,7.0,0.85,1
2,2qbq,3,54.744526,3.49,0.0,0.0,0.0,0
3,2qbq,4,53.284672,1.36,10.0,0.0,0.5,0
4,2qbq,5,54.744526,4.28,0.0,0.0,0.0,0


In [149]:
# Class balance: number of poses labelled 0 and 1.
label['label'].value_counts()

0    370
1     30
Name: label, dtype: int64

    0    8570
    1    4575
    Name: label, dtype: int64

In [150]:
# Total number of labelled rows (one per pdb/poserank pair in the merged table).
len(label['label'])

400

In [151]:
# Number of distinct PDB ids that received labels.
label['pdb'].nunique()

40

#### Creating dict 

In [152]:
# Build the nested mapping {pdb: {poserank: label}} that is later written to label.json.
# The per-row value is bound to `pose_label` rather than `label`, so the `label`
# DataFrame is not overwritten by the loop and this cell can be re-run safely.
# `.tolist()` yields plain Python scalars, which `json.dump` accepts as both
# keys and values.
result_dict = {}
rows = zip(
    label['pdb'].tolist(),
    label['poserank'].tolist(),
    label['label'].tolist(),
)
for pdb, poserank, pose_label in rows:
    # Create the inner dict the first time a pdb id is seen.
    result_dict.setdefault(pdb, {})[poserank] = pose_label

In [153]:
# Report the directory label.json is written to.
# NOTE(review): this message is printed before the cell that actually writes the
# file, so "saved" is only true once that cell has run.
print("label.json saved at:", config.data)

'.//Datahub/Data'

In [154]:
# Persist the nested {pdb: {poserank: label}} mapping as JSON in the data directory.
label_json_path = f"{config.data}/label.json"
with open(label_json_path, "w") as out_file:
    json.dump(result_dict, out_file)

In [155]:
# Read the saved label.json back as raw text.
# NOTE(review): `result` is reassigned by the pd.read_json cell that follows,
# so the string read here is not used afterwards.
with open(f"{config.data}/label.json", "r") as json_file:
    result = json_file.read()

In [156]:
# Load the saved labels as a DataFrame to check the round trip
# (the display below shows PDB ids as columns and pose ranks as the index).
result = pd.read_json(f"{config.data}/label.json")

In [157]:
# Display the reloaded label table.
result

Unnamed: 0,2qbq,1ydr,3b68,1owh,2vw5,2fxs,2cet,1ydt,2fvd,3ao4,...,1bzc,1bcu,1s38,2yfe,2wvt,2w66,1h23,2xb8,1z6e,3bgz
1,0,1,0,0,1,0,0,0,0,0,...,0,1,1,0,1,1,0,1,1,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Report 

| RMSD (Å) | score |
|----------|-------|
| RMSD < 2 | 10 |
| 2 ≤ RMSD < 3 | 7 |
| RMSD ≥ 3 | 0 |

| similarity | score |
|------------|-------|
| similarity ≥ 79 | 10 |
| 70 ≤ similarity < 79 | 7 |
| 66 ≤ similarity < 70 | 0 |
| similarity < 66 (e.g. 42) | 0 |


| Caso | PDB   | pose | similarity | rmsd | w_rmsd | w_similarity |
|------|-------|------|------------|------|--------|--------------|
| 1    | 2qbq  | 1    | 59.854015  | 1.15 | 10.0   | 0.0          |
| 2    | 2qbp  | 10   | 0.0        | 1.73 | -      | -            |
| 3    | 1ydr  | 10   | 0.0        | 2.17 | -      | -            |
| 4    | 1bzc  | 10   | 0.0        | 1.64 | -      | -            |


<font color='red'>**Olhar o caso abaixo:**</font>

| Caso | PDB   | pose | similarity | rmsd | w_rmsd | w_similarity |
|------|-------|------|------------|------|--------|--------------|
| 1    | 2qbq  | 4    | 53.284672  | 1.36 | 10.0   | 0.0          |
| 2    | 2qbq  | 5    | 54.744526  | 4.28 | 0.0    | 0.0          |


<font color='yellow'>Como a similaridade da pose 5 deu maior que a similaridade da pose 4?</font>

<font color='yellow'>**Caso 1: as principais interações bateram certinho; mesmo assim, a similaridade não ficou acima de 60%.**</font>

Caso 2: Por que tem uma pose 10?

<font color='yellow'>**Caso 3: Por que a pose 10 está com RMSD baixo?**</font>