# Protein Generator

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to library code take effect without
# restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys, json

from dataclasses import asdict

import py3Dmol
import torch
import numpy as np
from tqdm import tqdm

from proteome import protein
from proteome.models.protein_generator import config
from proteome.models.omegafold.modeling import OmegaFoldForFolding
from proteome.models.protein_generator.modeling import ProteinGeneratorForJointDesign
from proteome.models.protein_generator.sampler import SeqDiffSampler

In [3]:
def show_structure(pdb_str):
    """Build a py3Dmol viewer for a PDB string.

    The model is drawn as a cartoon coloured through a lookup table on the
    ``b`` property, plus a default stick representation, and the camera is
    zoomed to fit.

    Args:
        pdb_str: PDB text handed to ``addModelsAsFrames``.

    Returns:
        The 800x600 ``py3Dmol.view`` instance that was configured.
    """
    # (low, high, hex colour) triples -- presumably pLDDT ranges, going by the
    # original constant name. Only the colour entry is used below.
    plddt_bands = (
        (0, 50, '#FF7D45'),
        (50, 70, '#FFDB13'),
        (70, 90, '#65CBF3'),
        (90, 100, '#0053D6'),
    )
    # NOTE(review): the lookup is keyed by band index (0-3), not by the band
    # limits -- confirm the b-factor column of the input holds band indices.
    band_colors = {index: color for index, (_, _, color) in enumerate(plddt_bands)}
    style = {
        'cartoon': {'colorscheme': {'prop': 'b', 'map': band_colors}},
        'stick': {},
    }

    viewer = py3Dmol.view(width=800, height=600)
    viewer.addModelsAsFrames(pdb_str)
    # Model index -1 selects the most recently added model.
    viewer.setStyle({'model': -1}, style)
    viewer.zoomTo()
    return viewer

In [57]:
# Joint structure/sequence design model shared by every design section below;
# the seed is fixed at construction time.
designer = ProteinGeneratorForJointDesign(
    model_name="auto",
    random_seed=0,
)
# The folding model is left disabled here:
#folder = OmegaFoldForFolding()

In [62]:
def get_pdb():
    """Yield the paths of the ``reference*.pdb`` files in the working directory.

    The matches are yielded in sorted order so that iteration is reproducible;
    ``glob`` itself returns them in arbitrary filesystem order.

    Yields:
        str: one relative path per matching file (nothing if there are none).
    """
    from glob import glob

    yield from sorted(glob("reference*.pdb"))

## Binder Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/binder_design \
    --pdb examples/pdbs/cd86.pdb \
    --T 25 --save_best_plddt \
    --contigs B1-110,0 25-75 \
    --hotspots B40,B32,B87,B96,B30
```

In [5]:
# Read cd86.pdb from the working directory and parse it into the reference
# structure used by the binder design below.
with open("cd86.pdb") as pdb_file:
    reference_pdb_str = pdb_file.read()

reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [6]:
# Binder design: reference structure plus the contig and hotspot settings that
# correspond to the CLI example above.
binder_config = config.InferenceConfig(
    reference_structure=reference_structure,
    contigmap_params=config.ContigMap(contigs=["B1-110/0 25-75"]),
    hotspot_params=config.HotspotParams(hotspot_res=["B40", "B32", "B87", "B96", "B30"]),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(binder_config)
designed_pdb = designed_structure.to_pdb()

  assert input.numel() == input.storage().size(), (
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:59<00:00,  4.76s/it]


In [7]:
#show_structure(designed_pdb)

In [8]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_binder.pdb", mode="w") as f:
    f.write(designed_pdb)

In [15]:
designed_sequence

'SLLEQTLSVKGKTYKLRMQKSENGTYSWTRLSPRGEPLAQPVSVAPTFIMLKIQAYFNETADLPCQFANSQNQSLSELVVFWQDQENLVLNEVYLGKEKFDSVHSKYMGRTSFDSDSWTLRLHNLQIKDKGLYQCIIHHKKPTGMIRIHQMNSELSVLA'

## Motif Scaffolding

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --pdb examples/pdbs/rsv5_5tpn.pdb \
    --contigs 0-25,A163-181,25-30 --T 25 --save_best_plddt
```

In [6]:
# Read rsv5_5tpn.pdb from the working directory and parse it into the
# reference structure used by the motif-scaffolding design below.
with open("rsv5_5tpn.pdb") as pdb_file:
    reference_pdb_str = pdb_file.read()

reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [7]:
# Motif scaffolding: the contig string references residues A163-181 of the
# reference structure, corresponding to the CLI example above.
motif_config = config.InferenceConfig(
    reference_structure=reference_structure,
    contigmap_params=config.ContigMap(contigs=["0-25/A163-181/25-30"]),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(motif_config)
designed_pdb = designed_structure.to_pdb()

  assert input.numel() == input.storage().size(), (
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:04<00:00,  2.57s/it]


In [8]:
#show_structure(designed_pdb)

In [9]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_motif_scaffolding.pdb", mode="w") as f:
    f.write(designed_pdb)

In [10]:
designed_sequence

'SRPERVIRITPEEVNKIKSALLSTNKAVVSLNGKTIEIDRNDVIKDGEIIIDPNRKIKK'

## Partial Diffusion

```bash
python ./inference.py \
    --num_designs 10 \
    --pdb examples/out/design_000000.pdb \
    --out examples/out/partial_diffusion_design \
    --contigs 38 --sampling_temp 0.3 --T 50 --save_best_plddt
```

In [12]:
# Read design_000000.pdb from the working directory and parse it into the
# reference structure used by the partial-diffusion run below.
with open("design_000000.pdb") as pdb_file:
    reference_pdb_str = pdb_file.read()

reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [13]:
# Partial diffusion from the reference structure, with T=50 and a sampling
# temperature of 0.3 as in the CLI example above.
partial_diffusion_config = config.InferenceConfig(
    reference_structure=reference_structure,
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(contigs=["38"]),
    sampling_temp=0.3,
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    partial_diffusion_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:36<00:00,  2.43s/it]


In [14]:
#show_structure(designed_pdb)

In [15]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_partial_diffusion.pdb", mode="w") as f:
    f.write(designed_pdb)

In [16]:
designed_sequence

'GLSPEELREFLRREGIELRSEEELRELLERLEELRELR'

## Secondary Structure

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --secondary_structure XXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXX
```

In [18]:
# Secondary-structure specification string, passed to SecondaryStructureParams
# in the next cell.
# NOTE(review): presumably one character per residue with H = helix, L = loop
# and X = unconstrained -- confirm against the model documentation.
secondary_structure_str = "XXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXX"

In [19]:
# Design conditioned on the secondary-structure string defined in the cell
# above; no reference structure is supplied.
secondary_structure_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    secondary_structure_params=config.SecondaryStructureParams(
        secondary_structure=secondary_structure_str
    ),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    secondary_structure_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:05<00:00,  2.61s/it]


In [20]:
#show_structure(designed_pdb)

In [21]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_secondary_structure.pdb", mode="w") as f:
    f.write(designed_pdb)

In [22]:
designed_sequence

'SLEELVRIAKRYGIPLEELISAAREIIALIRAGRKLSAAEIEAIAARFAKKFGLSPEEAREFLLELIEEVAAGGVPSAAEMVALLKALRELVEDLVAIRK'

## Secondary Structure Bias

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --helix_bias 0.01 --strand_bias 0.01 --loop_bias 0.0 
```

In [24]:
# Design with helix and strand biases of 0.01 each; the loop bias from the CLI
# example above is not passed here, so StructureBiasParams' own default applies.
structure_bias_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    structure_bias_params=config.StructureBiasParams(helix_bias=0.01, strand_bias=0.01),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    structure_bias_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:03<00:00,  2.55s/it]


In [25]:
#show_structure(designed_pdb)

In [26]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_secondary_structure_bias.pdb", mode="w") as f:
    f.write(designed_pdb)

In [27]:
designed_sequence

'SEEELERKKKALEQAKEELEKANRAINEARRALRELDAAQKELIALLEILKDENLSEKEREKRLEEVEEKIKEEQAKLQAEREKINAIREEVEQLLKKAK'

## Secondary Structure from PDB

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 110 \
    --T 25 --save_best_plddt \
    --dssp_pdb examples/pdbs/cd86.pdb
```

In [29]:
# Read cd86.pdb from the working directory and parse it; the parsed structure
# is handed to SecondaryStructureParams as dssp_structure in the next cell.
with open("cd86.pdb") as pdb_file:
    dssp_pdb_str = pdb_file.read()

dssp_structure = protein.Protein27.from_pdb_string(dssp_pdb_str)

In [30]:
# Design conditioned on the structure parsed from cd86.pdb, supplied through
# the dssp_structure parameter.
dssp_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["110"]),
    secondary_structure_params=config.SecondaryStructureParams(dssp_structure=dssp_structure),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(dssp_config)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:08<00:00,  2.73s/it]


In [31]:
#show_structure(designed_pdb)

In [32]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_secondary_structure_from_pdb.pdb", mode="w") as f:
    f.write(designed_pdb)

In [33]:
designed_sequence

'QPLELTLSGNTLTVKLPEGWSAPEVSGPTVLAYRTLPGAQPLAVAPTFVLSDGGGTVSVSPARLEPPAFVFKAELPQNAKEVEVTLTIYQQVNGKWTLLKQVVFTLTRAP'

## Sequence Conditioning

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --sequence XXXXXXXXXXXXXXXXPEPSEQXXXXXXXXXXXXXXXX \
    --T 25 --save_best_plddt
```

In [35]:
# Sequence conditioning: the same partially specified sequence as the CLI
# example above, with a default-constructed ContigMap.
sequence_conditioning_config = config.InferenceConfig(
    sequence="XXXXXXXXXXXXXXXXPEPSEQXXXXXXXXXXXXXXXX",
    contigmap_params=config.ContigMap(),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    sequence_conditioning_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:59<00:00,  2.38s/it]


In [36]:
#show_structure(designed_pdb)

In [37]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_sequence_conditioning.pdb", mode="w") as f:
    f.write(designed_pdb)

In [38]:
designed_sequence

'SLEELLARIEELLEELPEPSEQAKAQLEELLARIKELK'

## Sequence Partial Diffusion

```bash
python ./inference.py \
    --num_designs 10 \
    --sequence SAKVEELLETAKALGISEEEVREILELLEAGFIVIEVVSLGDAVILILENKKLGKYYILKNGEIERIKKPENARELKRKIAEILNISVEEIEAIIEKLRAK \
    --out examples/out/partial_diffusion_design \
    --sampling_temp 0.3 --T 50 --save_best_plddt
```

In [40]:
# Partial diffusion starting from a full input sequence, with T=50 and a
# sampling temperature of 0.3 as in the CLI example above.
sequence_partial_diffusion_config = config.InferenceConfig(
    sequence="SAKVEELLETAKALGISEEEVREILELLEAGFIVIEVVSLGDAVILILENKKLGKYYILKNGEIERIKKPENARELKRKIAEILNISVEEIEAIIEKLRAK",
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(),
    sampling_temp=0.3,
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    sequence_partial_diffusion_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:39<00:00,  2.61s/it]


In [41]:
designed_sequence

'GIPPLIIIRIFRIPGITLDEIINFLKNLGFENIEIERLGENYFVIRFRINGREIIIVFDKNGKILDIIFSEEDLKEILEFLKKLGINPEELEKELEKIFPN'

In [42]:
#show_structure(designed_pdb)

In [43]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_sequence_partial_diffusion.pdb", mode="w") as f:
    f.write(designed_pdb)

In [44]:
designed_sequence

'GIPPLIIIRIFRIPGITLDEIINFLKNLGFENIEIERLGENYFVIRFRINGREIIIVFDKNGKILDIIFSEEDLKEILEFLKKLGINPEELEKELEKIFPN'

## Symmetric Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/symmetric_design \
    --contigs 25,0 25,0 25,0 \
    --T 50 \
    --save_best_plddt \
    --symmetry 3
```

In [46]:
# Symmetric design: three 25-residue contig segments with symmetry=3 and T=50,
# corresponding to the CLI example above.
symmetric_config = config.InferenceConfig(
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(contigs=["25/0 25/0 25/0"]),
    symmetry_params=config.SymmetryParams(symmetry=3),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(symmetric_config)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:03<00:00,  2.48s/it]


In [47]:
#show_structure(designed_pdb)

In [48]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_symmetric.pdb", mode="w") as f:
    f.write(designed_pdb)

In [49]:
designed_sequence

'NKIIIELTPEEIEEFLKFIKRIIEENKIIIILTPEEIEEFLKFIKRIIEENKIIIELTPEEIEEFLKFIKRIIEE'

## Unconditional Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt
```

In [51]:
# Unconditional design: only the contig "100" is specified; every other
# InferenceConfig field keeps its default.
unconditional_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    unconditional_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:06<00:00,  2.65s/it]


In [52]:
#designed_structure.atom_positions[0]

In [53]:
#from proteome.constants import residue_constants

In [54]:
#show_structure(designed_pdb)

In [55]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_unconditional.pdb", mode="w") as f:
    f.write(designed_pdb)

In [56]:
designed_sequence

'GPPPLSPEEIEELRELLEELAERFGISPEELARFFEPFIRIFLEKDPEELIEELRRFLESGFTREEFVEVSIPEIERYVEKGLLSDEEVEELLEFLERLG'

## Weighted Sequence

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --potentials aa_bias \
    --aa_composition W0.2 --potential_scale 1.75 
```

In [58]:
# Weighted-sequence design: one amino-acid composition potential ("W0.2") with
# a scale of 1.75, corresponding to the CLI example above.
aa_bias_potentials = config.PotentialsParams(
    potentials=[config.AACompositionalBiasParams(aa_composition="W0.2")],
    potential_scales=[1.75],
)
weighted_sequence_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    potentials_params=aa_bias_potentials,
)
designed_structure, designed_sequence = designer.design_structure_and_sequence(
    weighted_sequence_config
)
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:04<00:00,  2.57s/it]


In [59]:
#show_structure(designed_pdb)

In [60]:
# designed_pdb is used as one PDB string (it is what show_structure takes as
# pdb_str), so write it in a single call; writelines() would iterate over it
# character by character.
with open("reference_weighted_sequence.pdb", mode="w") as f:
    f.write(designed_pdb)

In [61]:
designed_sequence

'GLPEITPEEIEELKKLWEEWKEALEPFLEWLERRGIPIGNPEFIKEFEEFIEELRKEIKNGATREEIIEFFIEEIEELVEKGLITEEEVEEFLKWIERWG'

## Folding

In [None]:
# Create the folding model in this cell: the only other `folder = ...` line in
# the notebook is commented out, so on a fresh kernel `folder` would be
# undefined and this cell would raise NameError.
folder = OmegaFoldForFolding()

# Fold whatever sequence the most recently executed design cell produced.
predicted_protein, confidence = folder.fold(designed_sequence)
folded_pdb = protein.to_pdb(predicted_protein)

In [None]:
# Reuse the show_structure helper defined near the top of the notebook instead
# of repeating its body here. `view` stays bound at cell level, and the viewer
# is the cell's display value.
view = show_structure(folded_pdb)
view