## Summary

In this notebook we load a network trained to reconstruct protein sequences and use this network to design sequences that fit the geometry of CATH domain [1n5uA03](http://www.cathdb.info/version/latest/domain/1n5uA03).

----

In [1]:
#!wget -r -nH --cut-dirs 1 --reject "index.html*" "http://models.proteinsolver.org/v0.1/"

In [2]:
#!wget https://models.proteinsolver.org/v0.1/notebooks/protein_4xEdgeConv_bs4/e12-s1652709-d6610836.state
#!wget https://models.proteinsolver.org/v0.1/notebooks/protein_train/191f05de/e53-s1952148-d93703104.state

In [3]:
%load_ext autoreload
%autoreload 2

In [46]:
import torch
from Bio import PDB
from proteome import protein
from proteome.models.design.proteinsolver.proteinnet import ProteinNet
from proteome.models.design.proteinsolver.utils.protein_structure import extract_seq_and_adj
from proteome.models.design.proteinsolver.utils.core import Structure
from proteome.models.design.proteinsolver.utils.core.io import load
from proteome.models.design.proteinsolver.utils import dataset
from proteome.models.design.proteinsolver.utils.protein_design import get_node_outputs, design_sequence, design_protein
from proteome.models.design.proteinsolver.utils.protein_sequence import AMINO_ACIDS

In [5]:
batch_size = 1
num_features = 20
adj_input_size = 2
hidden_size = 128
frac_present = 0.5
frac_present_valid = frac_present
info_size= 1024
STRUCTURE_FILE = "5L33.pdb"

In [6]:
state = torch.load("e12-s1652709-d6610836.state", map_location="cpu")

In [16]:
model = ProteinNet(num_features + 1, adj_input_size, hidden_size, num_features)
msg = model.load_state_dict(state)
model = model.cpu()
model = model.eval()

In [17]:
with open("5L33.pdb", mode="r") as f:
    pdb_str = f.read()

In [18]:
structure = load("./5L33.pdb")

In [19]:
#protein_structure = protein.from_pdb_string(pdb_str)
#structure = protein.to_biopdb_structure(protein_structure)

In [20]:
list(structure.models)

[<Model id=0>]

In [21]:
pdata = extract_seq_and_adj(structure, 'A')
print(pdata)
print(f"Protein sequence: '{pdata.sequence}'")
print(f"Number of amino acids: {len(pdata.sequence)}")

ProteinData(sequence='HMPEEEKAARLFIEALEKGDPELMRKVISPDTRMEDNGREFTGDEVVEYVKEIQKRGEQWHLRRYTKEGNSWRFEVQVDNNGQTEQWEVQIEVRNGRIKRVTITHV', row_index=array([  0,   0,   0, ..., 103, 103, 104]), col_index=array([  1,   2,   3, ..., 104, 105, 105]), distances=array([1.32648709, 4.04303277, 5.94734907, ..., 1.33022028, 4.29586604,
       1.32567643]))
Protein sequence: 'HMPEEEKAARLFIEALEKGDPELMRKVISPDTRMEDNGREFTGDEVVEYVKEIQKRGEQWHLRRYTKEGNSWRFEVQVDNNGQTEQWEVQIEVRNGRIKRVTITHV'
Number of amino acids: 106


In [32]:
sequence_ref = pdata.sequence
print(len(sequence_ref), sequence_ref)

106 HMPEEEKAARLFIEALEKGDPELMRKVISPDTRMEDNGREFTGDEVVEYVKEIQKRGEQWHLRRYTKEGNSWRFEVQVDNNGQTEQWEVQIEVRNGRIKRVTITHV


In [47]:
data = dataset.row_to_data(pdata)
data = dataset.transform_edge_attr(data)

In [48]:
residues, residue_probas = design_sequence(
    model, data.cpu(), random_position=False, value_selection_strategy="map", num_categories=20, temperature=0.1
)

In [49]:
"".join([AMINO_ACIDS[i] for i in residues])

'HMPEEEKAARLFIEALEKGDPELMRKVISPDTRMEDNGREFTGDEVVEYVKEIQKRGEQWHLRRYTKEGNSWRFEVQVDNNGQTEQWEVQIEVRNGRIKRVTITHV'

In [50]:
data = dataset.row_to_data(pdata)
data = dataset.transform_edge_attr(data)
data.y = data.x
x_in = torch.ones_like(data.x) * 20

In [52]:
import numpy as np

In [55]:
results = []
design_protein(model, x_in, data.edge_index, data.edge_attr, results=results, cutoff=np.log(0.15))

i: 0; p: 0.0000; num missing: 106; heap size:       0; results size: 0
i: 1000; p: 82.7444; num missing: 2; heap size:    7460; results size: 577
i: 2000; p: 82.9719; num missing: 0; heap size:   14040; results size: 1198


KeyboardInterrupt: 

In [67]:
sequence = "".join([AMINO_ACIDS[i] for i in results[2].x])

In [68]:
from proteome.models.folding.omegafold.modeling import OmegaFoldForFolding

In [69]:
folder = OmegaFoldForFolding()

In [70]:
predicted_protein, confidence = folder.fold(sequence)
result_pdb = protein.to_pdb(predicted_protein)

In [71]:
import py3Dmol

In [72]:
PLDDT_BANDS = [
  (0, 50, '#FF7D45'),
  (50, 70, '#FFDB13'),
  (70, 90, '#65CBF3'),
  (90, 100, '#0053D6')
]
view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(result_pdb)

color_map = {i: bands[2] for i, bands in enumerate(PLDDT_BANDS)}
style = {'cartoon': {'colorscheme': {'prop': 'b', 'map': color_map}}}

style['stick'] = {}

view.setStyle({'model': -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7fb188b77670>