In [1]:
# Load IPython's autoreload extension and select mode 2, so that edited
# modules are re-imported before each cell runs instead of needing a kernel
# restart.
%load_ext autoreload
%autoreload 2

In [2]:
import multiprocessing
import os, sys
import argparse
import logging
import json
from pathlib import Path
from typing import *

import numpy as np
import pandas as pd
import mpl_scatter_density
from matplotlib import pyplot as plt
from astropy.visualization import LogStretch
from astropy.visualization.mpl_normalize import ImageNormalize

import torch
from huggingface_hub import snapshot_download


from proteome.models.design.foldingdiff import modelling
from proteome.models.design.foldingdiff import sampling
from proteome.models.design.foldingdiff import plotting
from proteome.models.design.foldingdiff.datasets import AnglesEmptyDataset, NoisedAnglesDataset
from proteome.models.design.foldingdiff.angles_and_coords import create_new_chain_nerf
from proteome.models.design.foldingdiff import utils
from proteome.models.design.foldingdiff.train import get_train_valid_test_sets

  from .autonotebook import tqdm as notebook_tqdm
2023-07-29 03:55:49.382553: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Fetch the "wukevin/foldingdiff_cath" repository from the Hugging Face Hub and
# keep the path returned by snapshot_download for the cells that follow.
dl_path = snapshot_download(repo_id="wukevin/foldingdiff_cath")

Fetching 6 files: 100%|████████████████████████████████████████| 6/6 [00:00<00:00, 49344.75it/s]


In [12]:
# List the top-level entries of the downloaded snapshot.
# Done in pure Python rather than `!ls {dl_path}`: the shell form interpolates
# the path unquoted (breaks on paths containing spaces) and needs an `ls`
# binary, whereas os.listdir works on any platform.
sorted(os.listdir(dl_path))

README.md  config.json	models	training_args.json  training_mean_offset.npy


In [4]:
# Build the diffusion model from the downloaded snapshot directory.
# `copy_to` points at a "foldingdiff_cath" folder inside the torch hub directory
# (presumably from_dir copies the checkpoint there — confirm in modelling).
model_snapshot_dir = os.path.join(torch.hub.get_dir(), "foldingdiff_cath")
model = modelling.BertForDiffusionBase.from_dir(dl_path, copy_to=model_snapshot_dir)

# Run on the GPU when one is visible and fall back to the CPU otherwise, so the
# notebook does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Using time embedding: GaussianFourierProjection()


In [5]:
# Total number of scalar parameters in the loaded model.
sum(param.numel() for param in model.parameters())

14458758

In [13]:
# Read the training arguments stored as JSON next to the checkpoint; the cells
# below look up dataset and noise-schedule settings in this dict.
training_args = json.loads(Path(dl_path, "training_args.json").read_text())

In [14]:
# Load the array saved as training_mean_offset.npy in the snapshot; it is
# handed to the placeholder dataset as `mean_offset`.
mean_offset_path = os.path.join(dl_path, "training_mean_offset.npy")
mean_offset = np.load(mean_offset_path)

In [15]:
# Empty placeholder dataset configured from the training arguments (feature
# set, padding length) and the saved mean offset.
angles_definitions = training_args["angles_definitions"]
placeholder_dset = AnglesEmptyDataset(
    feature_set_key=angles_definitions,
    pad=training_args["max_seq_len"],
    mean_offset=mean_offset,
)

# Cartesian-coordinate models read the "coords" entry; everything else reads
# "angles".
noised_dset_key = "coords" if angles_definitions == "cart-coords" else "angles"

# Shared keyword arguments for the noising wrappers; schedule and variance
# settings come straight from the training arguments.
noised_dset_kwargs = dict(
    dset=placeholder_dset,
    dset_key=noised_dset_key,
    timesteps=training_args["timesteps"],
    exhaustive_t=False,
    beta_schedule=training_args["variance_schedule"],
    nonangular_variance=1.0,
    angular_variance=training_args["variance_scale"],
)

# Three identically configured wrappers around the same placeholder dataset.
noised_dsets = [NoisedAnglesDataset(**noised_dset_kwargs) for _ in range(3)]

In [49]:
# Seed NumPy and PyTorch before sampling so that Restart & Run All reproduces
# the same backbone. Assumes sampling.sample draws its noise from these global
# generators — TODO confirm against the sampling module.
SAMPLING_SEED = 0
np.random.seed(SAMPLING_SEED)
torch.manual_seed(SAMPLING_SEED)

# Request `n` sample(s) over the length sweep (200, 201) from the first noised
# dataset, with the progress bar enabled.
# NOTE(review): the frame displayed after sampling ends at index 126 although
# lengths 200-201 are requested here — check whether the length is capped
# (e.g. by the dataset's `pad`) or whether that output is from an earlier run.
n = 1
sweep_min_len, sweep_max_len = 200, 201
sampled = sampling.sample(
    model,
    noised_dsets[0],
    n=n,
    sweep_lengths=(sweep_min_len, sweep_max_len),
    disable_pbar=False,
)

sampling loop time step: 100%|██████████████████████████████| 1000/1000 [00:15<00:00, 64.97it/s]


In [50]:
# Keep only the last entry of every sampled item (presumably the final
# denoising step of each trajectory — confirm against sampling.sample).
final_sampled = [trajectory[-1] for trajectory in sampled]

In [51]:
# Wrap each final sample in a DataFrame whose columns are the dataset's
# "angles" feature names.
angle_columns = noised_dsets[0].feature_names["angles"]
sampled_dfs = [
    pd.DataFrame(final_angles, columns=angle_columns)
    for final_angles in final_sampled
]

In [52]:
# Show the angle table built for the first sample.
sampled_dfs[0]

Unnamed: 0,phi,psi,omega,tau,CA:C:1N,C:1N:1CA
0,-1.439930,2.110073,2.999366,1.889829,2.020859,2.138032
1,-1.768185,2.272329,3.139795,1.897599,2.020759,2.146705
2,-1.923520,2.079623,3.038984,1.949959,2.049577,2.156932
3,-1.889570,2.053707,3.126452,1.916538,2.018767,2.143047
4,-1.747079,2.255037,-3.040499,1.941199,2.029085,2.137985
...,...,...,...,...,...,...
123,-1.345607,2.290400,3.076068,1.902046,2.031972,2.143218
124,-2.111587,2.260270,2.995788,1.892806,2.028688,2.139474
125,-1.882213,2.131870,3.021856,1.880626,2.026069,2.122019
126,-1.776762,1.887100,3.004518,1.895268,2.026029,2.135754


In [53]:
# Build a chain from the first sample's angle table and write it to
# ./generated.pdb in the working directory; the return value is kept in `out`.
generated_pdb_path = "./generated.pdb"
out = create_new_chain_nerf(generated_pdb_path, sampled_dfs[0])

In [54]:
# Read generated.pdb from the working directory back in as a single string.
generated_pdb = Path("generated.pdb").read_text()

In [55]:
import py3Dmol

from proteome import protein
from proteome.models.design.proteinmpnn import config
from proteome.models.design.proteinmpnn.modeling import ProteinMPNNForSequenceDesign
from proteome.models.folding.omegafold.modeling import OmegaFoldForFolding

In [56]:
# Instantiate the two models used by the design and folding cells below.
# The statement order is left as-is: what the constructors do (weight loading,
# RNG seeding) is not visible from this notebook.
# ProteinMPNN sequence designer, selected by the name "vanilla_model-20" and
# given a fixed random_seed (presumably for reproducible designs — confirm).
designer = ProteinMPNNForSequenceDesign("vanilla_model-20", random_seed=37)
# OmegaFold folding model with its default arguments.
folder = OmegaFoldForFolding()

In [57]:
# Parse the generated backbone PDB text for the designer: CA-only when the
# designer is CA-only, backbone-only otherwise.
designer_is_ca_only = designer.ca_only
target_protein = protein.from_pdb_string(
    generated_pdb,
    ca_only=designer_is_ca_only,
    backbone_only=not designer_is_ca_only,
)

In [58]:
# Design a sequence for the target structure and report it together with the
# score returned by the designer.
design_result = designer.design_sequence(target_protein)
sequence, score = design_result
print(f"Sequence: {sequence} with global_score {score}")

Sequence: PLPPPPPPPLPPPPPLLLLLLLLLLLLKLLLLPLPLLLLLLLPPPLLLLLLLLRLLRLLLLELLLLLPPPPPLPLLLLLLPLPLPELLELLLLLLLLLPLLLLPPLPPPLEPLHLLPLLPLPLLPPPP with global_score 1.307490587234497


In [59]:
# Fold the designed sequence, then serialise the predicted structure to PDB
# text for the viewer and for the next design round.
fold_output = folder.fold(sequence)
predicted_protein, confidence = fold_output
result_pdb = protein.to_pdb(predicted_protein)

In [60]:
# Render the generated backbone (generated_pdb) in an 800x600 py3Dmol viewer
# with cartoon + stick styles.
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# Colour lookup keyed by band index (0-3), applied to the atoms' "b" property.
# NOTE(review): nothing in this cell rewrites the B-factors to band indices, so
# confirm that a map keyed 0-3 actually matches the B-factor values stored in
# this PDB text.
color_map = {
    band_index: hex_color
    for band_index, (_, _, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(generated_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7ff3017bedd0>

In [61]:
# Render the folded structure (result_pdb) in an 800x600 py3Dmol viewer with
# cartoon + stick styles.
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# Colour lookup keyed by band index (0-3), applied to the atoms' "b" property.
# NOTE(review): nothing in this cell rewrites the B-factors to band indices, so
# confirm that a map keyed 0-3 actually matches the B-factor values stored in
# this PDB text.
color_map = {
    band_index: hex_color
    for band_index, (_, _, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(result_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7ff42d76cbe0>

In [45]:
# Parse the folded structure's PDB text as the new design target: CA-only when
# the designer is CA-only, backbone-only otherwise.
designer_is_ca_only = designer.ca_only
target_protein = protein.from_pdb_string(
    result_pdb,
    ca_only=designer_is_ca_only,
    backbone_only=not designer_is_ca_only,
)

In [46]:
# Design a sequence for the current target structure and report it together
# with the score returned by the designer.
design_result = designer.design_sequence(target_protein)
sequence, score = design_result
print(f"Sequence: {sequence} with global_score {score}")

Sequence: LAALAPLLLPLLAATAAAAAALPPAGYAAYLAALLAALAALLAAL with global_score 1.0782400369644165


In [47]:
# Fold the current designed sequence and overwrite result_pdb with the PDB text
# of the new prediction.
fold_output = folder.fold(sequence)
predicted_protein, confidence = fold_output
result_pdb = protein.to_pdb(predicted_protein)

In [48]:
# Render the current folded structure (result_pdb) in an 800x600 py3Dmol viewer
# with cartoon + stick styles.
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# Colour lookup keyed by band index (0-3), applied to the atoms' "b" property.
# NOTE(review): nothing in this cell rewrites the B-factors to band indices, so
# confirm that a map keyed 0-3 actually matches the B-factor values stored in
# this PDB text.
color_map = {
    band_index: hex_color
    for band_index, (_, _, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(result_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7ff42d76c2b0>