In [25]:
%load_ext autoreload
%autoreload 2

import os
from Bio import PDB
import numpy as np
from pathlib import Path
import py3Dmol
import warnings
import Bio

from benchmark import visualize_structure_alignment, plot_with_error

warnings.simplefilter("ignore", Bio.PDB.PDBExceptions.PDBConstructionWarning)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
truth_dir = "data/casp15_targets_TSdomains_4invitees"
# pred_dir = "data/foldtoken_level_8_casp"
pred_dir = "data/bio2token_casp"

truth_files = [f.split(".")[0] for f in os.listdir(truth_dir) if f.endswith(".pdb")]
pred_files = [f.split(".")[0] for f in os.listdir(pred_dir) if f.endswith(".pdb")]
pred_files.sort()
truth_files.sort()

In [None]:
truth_pred_files = list(zip(truth_files, pred_files))

parser = PDB.PDBParser()
rmsd_results = {}
tm_results = {}

for truth_name, pred_name in truth_pred_files:
    print(truth_name, pred_name)
    if truth_name != pred_name:
        raise ValueError(
            f"Truth file {truth_name} does not match prediction file {pred_name}"
        )

    truth_structure = parser.get_structure(
        truth_name, os.path.join(truth_dir, truth_name + ".pdb")
    )
    # print(list(truth_structure.get_atoms()))

    pred_structure = parser.get_structure(
        pred_name, os.path.join(pred_dir, pred_name + ".pdb")
    )
    # print(list(pred_structure.get_atoms()))

    try:
        rmsd, view = visualize_structure_alignment(
            truth_structure, pred_structure, parser
        )
        # view.show()
    except Exception as e:
        print(e)
        rmsd = None

    rmsd_results[truth_name] = rmsd


T1104-D1 T1104-D1
Number of CA atoms in truth structure: 117
Number of CA atoms in predicted structure: 117
T1106s1-D1 T1106s1-D1
Number of CA atoms in truth structure: 71
Number of CA atoms in predicted structure: 71
T1106s2-D1 T1106s2-D1
Number of CA atoms in truth structure: 111
Number of CA atoms in predicted structure: 111
T1109-D1 T1109-D1
Number of CA atoms in truth structure: 214
Number of CA atoms in predicted structure: 214
T1110-D1 T1110-D1
Number of CA atoms in truth structure: 221
Number of CA atoms in predicted structure: 221
T1112-D1 T1112-D1
Number of CA atoms in truth structure: 460
Number of CA atoms in predicted structure: 460
T1113-D1 T1113-D1
Number of CA atoms in truth structure: 167
Number of CA atoms in predicted structure: 167
T1114s1-D1 T1114s1-D1
Number of CA atoms in truth structure: 60
Number of CA atoms in predicted structure: 60
T1114s2-D1 T1114s2-D1
Number of CA atoms in truth structure: 322
Number of CA atoms in predicted structure: 322
T1114s3-D1 T1114

In [64]:
import pandas as pd

df = pd.DataFrame(list(rmsd_results.items()), columns=["Target", "RMSD"])
display(df)

df.to_csv("./data/results/bio2token_casp_rmsd_results.csv", index=False)

Unnamed: 0,Target,RMSD
0,T1104-D1,0.737521
1,T1106s1-D1,0.894725
2,T1106s2-D1,0.724119
3,T1109-D1,0.706416
4,T1110-D1,0.684899
...,...,...
91,T1184-D1,0.744754
92,T1186-D1,0.813125
93,T1187-D1,0.841799
94,T1188-D1,0.965056


In [55]:
print(*list(rmsd_results.items()), sep="\n")

('T1104-D1', np.float64(0.7375210365317114))
('T1106s1-D1', np.float64(0.8947246370450026))
('T1106s2-D1', np.float64(0.72411869259661))
('T1109-D1', np.float64(0.7064158320909897))
('T1110-D1', np.float64(0.6848992397940856))
('T1112-D1', np.float64(0.903430476150939))
('T1113-D1', np.float64(0.8832131369795045))
('T1114s1-D1', np.float64(0.952588220148387))
('T1114s2-D1', np.float64(0.8158035590181357))
('T1114s3-D1', np.float64(0.8585007893154994))
('T1119-D1', np.float64(0.7029347343982892))
('T1120-D1', np.float64(0.7553508965530057))
('T1120-D2', np.float64(0.7950902431762644))
('T1121-D1', np.float64(0.8901223060991335))
('T1121-D2', np.float64(0.9450557219155139))
('T1122-D1', np.float64(0.9447172469669858))
('T1123-D1', np.float64(0.7854059775194785))
('T1124-D1', np.float64(0.9601520089708547))
('T1125-D1', np.float64(0.8021868734169865))
('T1125-D2', None)
('T1125-D3', None)
('T1125-D4', None)
('T1125-D5', None)
('T1125-D6', np.float64(0.7863091867584056))
('T1127-D1', np.fl

In [56]:
plot_with_error(list(rmsd_results.values()))


TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'