In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import multiprocessing
import os, sys
import argparse
import logging
import json
from pathlib import Path
from typing import *

import numpy as np
import pandas as pd
import mpl_scatter_density
from matplotlib import pyplot as plt
from astropy.visualization import LogStretch
from astropy.visualization.mpl_normalize import ImageNormalize

import torch
from huggingface_hub import snapshot_download


from proteome.models.design.foldingdiff.model import BertForDiffusionBase
from proteome.models.design.foldingdiff import sampling
from proteome.models.design.foldingdiff.datasets import AnglesEmptyDataset, CathCanonicalAnglesOnlyDataset, NoisedAnglesDataset
from proteome.models.design.foldingdiff.angles_and_coords import create_new_chain_nerf
from proteome.models.design.foldingdiff import utils

  from .autonotebook import tqdm as notebook_tqdm
2023-07-29 18:08:22.774032: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [72]:
# Fetch the pretrained FoldingDiff CATH snapshot from the Hugging Face Hub.
# This is the only definition of `dl_path`; later cells read the model weights,
# training_args.json and training_mean_offset.npy from this directory, so it
# must run on a fresh kernel (it was previously commented out, which made
# Restart & Run All fail with a NameError).
dl_path = snapshot_download("wukevin/foldingdiff_cath")

In [69]:
# Display the snapshot directory path held in `dl_path`.
dl_path

'/home/conradry71/.cache/huggingface/hub/models--wukevin--foldingdiff_cath/snapshots/98d77b1e68468db5ca03cdba1c0a90f2a2a33edc'

In [36]:
#torch.load("epoch=1488-step=565820.ckpt")

In [37]:
import wget

In [43]:
# Download the checkpoint file through torch.hub and deserialize it onto the
# CPU. The result is inspected further down via `ckpt.keys()`.
checkpoint_url = "https://huggingface.co/wukevin/foldingdiff_cath/resolve/main/models/best_by_valid/epoch%3D1488-step%3D565820.ckpt"
ckpt = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")

Downloading: "https://huggingface.co/wukevin/foldingdiff_cath/resolve/main/models/best_by_valid/epoch%3D1488-step%3D565820.ckpt" to /home/conradry71/.cache/torch/hub/checkpoints/epoch%3D1488-step%3D565820.ckpt
100%|██████████████████████████████████████████████████| 55.2M/55.2M [00:00<00:00, 204MB/s]


In [None]:
# FIXME: unfinished scratch cell. The dangling call below is a SyntaxError when
# executed, which aborts "Run All"; it is kept only as a commented reminder.
# The model is actually built with BertForDiffusionBase.from_dir(...).
# BertForDiffusionBase(

In [4]:
# Build the pretrained model from the downloaded snapshot directory. A folder
# under the torch hub cache is passed as `copy_to` (see
# BertForDiffusionBase.from_dir for what exactly gets copied there).
model_snapshot_dir = os.path.join(torch.hub.get_dir(), "foldingdiff_cath")
model = BertForDiffusionBase.from_dir(dl_path, copy_to=model_snapshot_dir)
# Use the GPU when one is available; otherwise stay on the CPU instead of
# crashing on an unconditional .cuda() call.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

Using time embedding: GaussianFourierProjection()


In [5]:
# Total number of scalar elements across all named parameters of the model.
sum(tensor.numel() for _name, tensor in model.named_parameters())

14458758

In [73]:
# Show the top-level keys of the downloaded checkpoint dictionary.
ckpt.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops'])

In [6]:
# Parse training_args.json from the snapshot directory into a dict.
training_args = json.loads(Path(dl_path, "training_args.json").read_text())

In [26]:
# Display the training configuration loaded from training_args.json.
training_args

{'angles_definitions': 'canonical-full-angles',
 'max_seq_len': 128,
 'min_seq_len': 40,
 'trim_strategy': 'randomcrop',
 'zero_center': True,
 'timesteps': 1000,
 'variance_schedule': 'cosine',
 'variance_scale': 1.0,
 'time_encoding': 'gaussian_fourier',
 'num_hidden_layers': 12,
 'hidden_size': 384,
 'intermediate_size': 768,
 'num_heads': 12,
 'position_embedding_type': 'relative_key',
 'dropout_p': 0.1,
 'decoder': 'mlp',
 'gradient_clip': 1.0,
 'batch_size': 64,
 'lr': 5e-05,
 'loss': 'smooth_l1',
 'l2_norm': 0.0,
 'l1_norm': 0.0,
 'circle_reg': 0.0,
 'min_epochs': 10000,
 'max_epochs': 10000,
 'early_stop_patience': 0,
 'lr_scheduler': 'LinearWarmup',
 'use_swa': False,
 'subset': None,
 'exhaustive_validation_t': False,
 'syn_noiser': '',
 'single_angle_debug': -1,
 'single_timestep_debug': False,
 'cpu_only': False,
 'ngpu': -1,
 'write_valid_preds': False,
 'dryrun': True,
 'multithread': True}

In [70]:
# Load the array saved as training_mean_offset.npy in the snapshot directory;
# it is handed to the dataset as `mean_offset`.
mean_offset_path = Path(dl_path) / "training_mean_offset.npy"
mean_offset = np.load(mean_offset_path)

In [59]:
# Build the dataset objects that sampling needs, configured from the training
# arguments so they match how the model was trained.
angle_definitions = training_args["angles_definitions"]

placeholder_dset = AnglesEmptyDataset(
    feature_set_key=angle_definitions,
    pad=training_args["max_seq_len"],
    mean_offset=mean_offset,
)

# The wrapped dataset is keyed by "coords" only for the Cartesian-coordinate
# feature set; every other feature set uses "angles".
if angle_definitions == "cart-coords":
    noised_key = "coords"
else:
    noised_key = "angles"

noised_dset = NoisedAnglesDataset(
    dset=placeholder_dset,
    dset_key=noised_key,
    timesteps=training_args["timesteps"],
    exhaustive_t=False,
    beta_schedule=training_args["variance_schedule"],
    nonangular_variance=1.0,
    angular_variance=training_args["variance_scale"],
)

In [60]:
# Seed torch's global RNG before sampling so that a Restart & Run All produces
# the same backbone (assuming sampling.sample draws its noise from torch's
# global generator -- TODO confirm). The value mirrors the random_seed passed
# to the ProteinMPNN designer elsewhere in this notebook.
SAMPLING_SEED = 37
torch.manual_seed(SAMPLING_SEED)

n = 1  # forwarded as `n` to sampling.sample
# Forwarded as `sweep_lengths`; presumably a half-open length range, i.e. only
# length 85 is sampled -- TODO confirm against sampling.sample.
sweep_min_len, sweep_max_len = 85, 86
sampled = sampling.sample(
    model, noised_dset, n=n, sweep_lengths=(sweep_min_len, sweep_max_len), disable_pbar=False
)

sampling loop time step: 100%|█████████████████████████| 1000/1000 [00:15<00:00, 62.68it/s]


In [61]:
# Keep only the last element of every entry returned by sampling.sample
# (presumably the final denoising step of each trajectory -- TODO confirm).
final_sampled = [trajectory[-1] for trajectory in sampled]

In [62]:
# Wrap each final sample in a DataFrame whose columns are the dataset's
# "angles" feature names.
sampled_dfs = [
    pd.DataFrame(angle_values, columns=noised_dset.feature_names["angles"])
    for angle_values in final_sampled
]

In [52]:
#sampled_dfs[22]

In [None]:
# FIXME: unfinished scratch cell. The dangling call below is a SyntaxError when
# executed (and `protein` is only imported further down), so it would abort
# "Run All"; it is kept only as a commented reminder.
# protein.Protein(

In [63]:
# Pass the last sampled angle table to create_new_chain_nerf with the target
# path ./generated.pdb (the file is read back in a later cell). The return
# value is stored in `out` but not used by any visible cell.
out = create_new_chain_nerf("./generated.pdb", sampled_dfs[-1])

In [64]:
# Read the generated structure back from disk as a single PDB-format string.
generated_pdb = Path("generated.pdb").read_text()

In [57]:
import py3Dmol

from proteome import protein
from proteome.models.design.proteinmpnn import config
from proteome.models.design.proteinmpnn.modeling import ProteinMPNNForSequenceDesign
from proteome.models.folding.omegafold.modeling import OmegaFoldForFolding

In [16]:
# Instantiate the ProteinMPNN sequence designer ("vanilla_model-20", fixed
# random seed) and the OmegaFold folding model with its default arguments.
designer = ProteinMPNNForSequenceDesign("vanilla_model-20", random_seed=37)
folder = OmegaFoldForFolding()

In [17]:
# Parse the generated PDB text into a protein object, choosing CA-only or
# backbone-only parsing according to the designer's `ca_only` flag.
use_ca_only = designer.ca_only
target_protein = protein.from_pdb_string(
    generated_pdb,
    ca_only=use_ca_only,
    backbone_only=not use_ca_only,
)

In [18]:
# Design a sequence for the generated backbone and report it together with the
# score returned by the designer.
sequence, score = designer.design_sequence(target_protein)
print(f"Sequence: {sequence} with global_score {score}")

Sequence: LHHLHLHLHLELLLLLEELLPLLLLPPYLLLPLLPPLPLLLPPLLLLLLLLLLPLLLLPLLLLHHHLEPPPLLLLLLYLLLLLLL with global_score 1.203758716583252


In [19]:
# Fold the designed sequence and serialize the predicted structure to PDB text.
# `confidence` is returned by the folder but not used by any visible cell.
predicted_protein, confidence = folder.fold(sequence)
result_pdb = protein.to_pdb(predicted_protein)

In [65]:
# Interactive 3D view of the generated backbone (cartoon + sticks).
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# NOTE(review): the colour map is keyed by band *index* (0-3) while the
# colourscheme reads the b-factor property; confirm the b-factors in this PDB
# actually hold band indices, otherwise the colours will not reflect the bands.
color_map = {
    band_index: hex_color
    for band_index, (_lower, _upper, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(generated_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7f2b10693b50>

In [21]:
# Interactive 3D view of the folded prediction (cartoon + sticks).
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# NOTE(review): the colour map is keyed by band *index* (0-3) while the
# colourscheme reads the b-factor property; confirm the b-factors written by
# protein.to_pdb hold band indices rather than raw confidence values.
color_map = {
    band_index: hex_color
    for band_index, (_lower, _upper, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(result_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7f2c5d3dcfd0>

In [45]:
# Second round: parse the *folded* structure (result_pdb) as the new design
# target, again following the designer's `ca_only` flag.
use_ca_only = designer.ca_only
target_protein = protein.from_pdb_string(
    result_pdb,
    ca_only=use_ca_only,
    backbone_only=not use_ca_only,
)

In [46]:
# Design a new sequence for the current target_protein; this overwrites the
# `sequence` and `score` values from the earlier design cell.
sequence, score = designer.design_sequence(target_protein)
print(f"Sequence: {sequence} with global_score {score}")

Sequence: LCLLCCLHCHLCHPPWARLGHLLGLLLHLLLLLLLLPPLLLLLLLLGHCLLCHLLHPPALLLLHLLLHLRKPLLLPILLLALLGLLLLLLLLRLLPPPLRLLILLHALLHLLGALLHLLALLLLLLL with global_score 1.2596246004104614


In [47]:
# Fold the re-designed sequence and serialize it to PDB text. Note that this
# overwrites `result_pdb`, the same variable the preceding parse cell reads, so
# re-running these cells out of order changes their input.
predicted_protein, confidence = folder.fold(sequence)
result_pdb = protein.to_pdb(predicted_protein)

In [48]:
# Interactive 3D view of the second-round folded prediction (cartoon + sticks).
# Each band is (lower bound, upper bound, hex colour).
PLDDT_BANDS = [
    (0, 50, "#FF7D45"),
    (50, 70, "#FFDB13"),
    (70, 90, "#65CBF3"),
    (90, 100, "#0053D6"),
]

# NOTE(review): the colour map is keyed by band *index* (0-3) while the
# colourscheme reads the b-factor property; confirm the b-factors written by
# protein.to_pdb hold band indices rather than raw confidence values.
color_map = {
    band_index: hex_color
    for band_index, (_lower, _upper, hex_color) in enumerate(PLDDT_BANDS)
}
style = {
    "cartoon": {"colorscheme": {"prop": "b", "map": color_map}},
    "stick": {},
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(result_pdb)
view.setStyle({"model": -1}, style)
view.zoomTo()

<py3Dmol.view at 0x7f7331710e20>