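# Protein Generator

Each section below first shows an upstream `inference.py` command line and then the corresponding call through the `proteome` Python API.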

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys, json

from dataclasses import asdict

import py3Dmol
import torch
import numpy as np
from tqdm import tqdm

from proteome import protein
from proteome.models.protein_generator import config
from proteome.models.omegafold.modeling import OmegaFoldForFolding
from proteome.models.protein_generator.modeling import ProteinGeneratorForJointDesign
from proteome.models.protein_generator.sampler import SeqDiffSampler

In [3]:
# Build the designer once; every section below reuses this same instance.
# NOTE(review): `random_seed=0` presumably fixes the sampling RNG so that runs
# are repeatable, and `model_name="auto"` presumably lets the library pick the
# checkpoint — confirm against the library.
designer = ProteinGeneratorForJointDesign(model_name="auto", random_seed=0)

## Binder Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/binder_design \
    --pdb examples/pdbs/cd86.pdb \
    --T 25 --save_best_plddt \
    --contigs B1-110,0 25-75 \
    --hotspots B40,B32,B87,B96,B30
```

In [4]:
# Read the CD86 PDB file (path relative to the working directory) as text,
# then parse that text into a Protein27 structure used as the design reference.
with open("cd86.pdb", "r") as f:
    reference_pdb_str = f.read()
reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [5]:
# Python counterpart of the binder-design CLI example above: the same contig
# specification (written with `/` where the CLI uses `,`) and the same hotspots.
hotspot_residues = ["B40", "B32", "B87", "B96", "B30"]
binder_config = config.InferenceConfig(
    reference_structure=reference_structure,
    contigmap_params=config.ContigMap(contigs=["B1-110/0 25-75"]),
    hotspot_params=config.HotspotParams(hotspot_res=hotspot_residues),
)

# Only the first two items returned by the designer call are used here:
# the designed structure and its sequence.
design_outputs = designer(binder_config)
designed_structure, designed_sequence = design_outputs[:2]

# Serialize the structure so it can be written to disk in a later cell.
designed_pdb = designed_structure.to_pdb()

  assert input.numel() == input.storage().size(), (
100%|███████████████████████████████████████████████████████████████| 25/25 [02:00<00:00,  4.81s/it]

AAtype [15 10 10  6  5 16 10 15 19 11  7 11 16 18 11 10  1 12  5 11 15  6  2  7
 16 18 15 17 16  1 10 15 14  1  7  6 14 10  0  5 14 19 15 19  0 14 16 13
  9 12 10 11  9  5  0 18 13  2  6 16  0  3 10 14  4  5 13  0  2 15  5  2
  5 15 10 15  6 10 19 19 13 17  5  3  5  6  2 10 19 10  2  6 19 18 10  7
 11  6 11 13  3 15 19  8 15 11 18 12  7  1 16 15 13  3 15  3 15 17 16 10
  1 10  8  2 10  5  9 11  3 11  7 10 18  5  4  9  9  8  8 11 11 14 16  7
 12  9  1  9  8  5 12  2 15  6 10 15 19 10  0]
Sequence before:  SLLEQTLSVKGKTYKLRMQKSENGTYSWTRLSPRGEPLAQPVSVAPTFIMLKIQAYFNETADLPCQFANSQNQSLSELVVFWQDQENLVLNEVYLGKEKFDSVHSKYMGRTSFDSDSWTLRLHNLQIKDKGLYQCIIHHKKPTGMIRIHQMNSELSVLA
Sequence after:  SLLEQTLSVKGKTYKLRMQKSENGTYSWTRLSPRGEPLAQPVSVAPTFIMLKIQAYFNETADLPCQFANSQNQSLSELVVFWQDQENLVLNEVYLGKEKFDSVHSKYMGRTSFDSDSWTLRLHNLQIKDKGLYQCIIHHKKPTGMIRIHQMNSELSVLA





In [9]:
designed_structure.show()

<py3Dmol.view at 0x7fd083bff6a0>

In [7]:
# Save the designed binder as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_binder.pdb", mode="w") as f:
    f.write(designed_pdb)

In [8]:
designed_sequence

'SLLEQTLSVKGKTYKLRMQKSENGTYSWTRLSPRGEPLAQPVSVAPTFIMLKIQAYFNETADLPCQFANSQNQSLSELVVFWQDQENLVLNEVYLGKEKFDSVHSKYMGRTSFDSDSWTLRLHNLQIKDKGLYQCIIHHKKPTGMIRIHQMNSELSVLA'

## Motif Scaffolding

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --pdb examples/pdbs/rsv5_5tpn.pdb \
    --contigs 0-25,A163-181,25-30 --T 25 --save_best_plddt
```

In [4]:
# Read the RSV5 (5TPN) PDB file as text, then parse it into a Protein27
# structure that supplies the motif for scaffolding.
with open("rsv5_5tpn.pdb", "r") as f:
    reference_pdb_str = f.read()
reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [5]:
# Python counterpart of the motif-scaffolding CLI example above; the contig
# string is the same one, written with `/` separators instead of `,`.
motif_config = config.InferenceConfig(
    reference_structure=reference_structure,
    contigmap_params=config.ContigMap(contigs=["0-25/A163-181/25-30"]),
)

# Only the first two items returned by the designer call are used here:
# the designed structure and its sequence.
design_outputs = designer(motif_config)
designed_structure, designed_sequence = design_outputs[:2]

# Serialize the structure to PDB text for later inspection or saving.
designed_pdb = designed_structure.to_pdb()

  assert input.numel() == input.storage().size(), (
100%|███████████████████████████████████████████████████████████████| 25/25 [01:04<00:00,  2.60s/it]


In [6]:
#show_structure(designed_pdb)

In [7]:
#with open("reference_motif_scaffolding.pdb", mode="w") as f:
#    f.writelines(designed_pdb)

In [8]:
designed_sequence

'SRPERVIRITPEEVNKIKSALLSTNKAVVSLNGKTIEIDRNDVIKDGEIIIDPNRKIKK'

## Partial Diffusion

```bash
python ./inference.py \
    --num_designs 10 \
    --pdb examples/out/design_000000.pdb \
    --out examples/out/partial_diffusion_design \
    --contigs 38 --sampling_temp 0.3 --T 50 --save_best_plddt
```

In [12]:
# Read a previously generated design (PDB text) and parse it into a Protein27
# structure; it is the starting point for partial diffusion.
with open("design_000000.pdb", "r") as f:
    reference_pdb_str = f.read()
reference_structure = protein.Protein27.from_pdb_string(reference_pdb_str)

In [13]:
# Python counterpart of the partial-diffusion CLI example above
# (`--contigs 38 --sampling_temp 0.3 --T 50`).
partial_diffusion_config = config.InferenceConfig(
    reference_structure=reference_structure,
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(contigs=["38"]),
    sampling_temp=0.3,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    partial_diffusion_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:36<00:00,  2.43s/it]


In [14]:
#show_structure(designed_pdb)

In [15]:
# Save the partially diffused design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_partial_diffusion.pdb", mode="w") as f:
    f.write(designed_pdb)

In [16]:
designed_sequence

'GLSPEELREFLRREGIELRSEEELRELLERLEELRELR'

## Secondary Structure

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --secondary_structure XXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXX
```

In [18]:
# Secondary-structure specification string; it is the same value passed to
# `--secondary_structure` in the CLI example above.
# NOTE(review): presumably one character per residue, with H = helix,
# L = loop and X = unconstrained — confirm against the model documentation.
secondary_structure_str = "XXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXXXXXXXHHHHXXXLLLXXXXX"

In [19]:
# Python counterpart of the secondary-structure CLI example above: a contig of
# 100 together with the secondary-structure string defined in the previous cell.
ss_params = config.SecondaryStructureParams(
    secondary_structure=secondary_structure_str
)
ss_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    secondary_structure_params=ss_params,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    ss_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:05<00:00,  2.61s/it]


In [20]:
#show_structure(designed_pdb)

In [21]:
# Save the secondary-structure-conditioned design as a PDB file.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_secondary_structure.pdb", mode="w") as f:
    f.write(designed_pdb)

In [22]:
designed_sequence

'SLEELVRIAKRYGIPLEELISAAREIIALIRAGRKLSAAEIEAIAARFAKKFGLSPEEAREFLLELIEEVAAGGVPSAAEMVALLKALRELVEDLVAIRK'

## Secondary Structure Bias

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --helix_bias 0.01 --strand_bias 0.01 --loop_bias 0.0 
```

In [24]:
# Python counterpart of the secondary-structure-bias CLI example above.
# Only the helix and strand biases are passed explicitly; the CLI's
# `--loop_bias 0.0` is not set here, so the config's own default applies.
bias_params = config.StructureBiasParams(helix_bias=0.01, strand_bias=0.01)
bias_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    structure_bias_params=bias_params,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    bias_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:03<00:00,  2.55s/it]


In [25]:
#show_structure(designed_pdb)

In [26]:
# Save the bias-conditioned design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_secondary_structure_bias.pdb", mode="w") as f:
    f.write(designed_pdb)

In [27]:
designed_sequence

'SEEELERKKKALEQAKEELEKANRAINEARRALRELDAAQKELIALLEILKDENLSEKEREKRLEEVEEKIKEEQAKLQAEREKINAIREEVEQLLKKAK'

## Secondary Structure from PDB

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 110 \
    --T 25 --save_best_plddt \
    --dssp_pdb examples/pdbs/cd86.pdb
```

In [29]:
# Read the CD86 PDB file as text and parse it into a Protein27 structure; this
# structure is passed as `dssp_structure` in the next cell.
with open("cd86.pdb", "r") as f:
    dssp_pdb_str = f.read()
dssp_structure = protein.Protein27.from_pdb_string(dssp_pdb_str)

In [30]:
# Python counterpart of the `--dssp_pdb` CLI example above: a contig of 110 and
# the structure loaded in the previous cell as the secondary-structure source.
dssp_params = config.SecondaryStructureParams(dssp_structure=dssp_structure)
dssp_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["110"]),
    secondary_structure_params=dssp_params,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    dssp_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:08<00:00,  2.73s/it]


In [31]:
#show_structure(designed_pdb)

In [32]:
# Save the DSSP-conditioned design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_secondary_structure_from_pdb.pdb", mode="w") as f:
    f.write(designed_pdb)

In [33]:
designed_sequence

'QPLELTLSGNTLTVKLPEGWSAPEVSGPTVLAYRTLPGAQPLAVAPTFVLSDGGGTVSVSPARLEPPAFVFKAELPQNAKEVEVTLTIYQQVNGKWTLLKQVVFTLTRAP'

## Sequence Conditioning

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --sequence XXXXXXXXXXXXXXXXPEPSEQXXXXXXXXXXXXXXXX \
    --T 25 --save_best_plddt
```

In [35]:
# Python counterpart of the sequence-conditioning CLI example above: the same
# `--sequence` string, with a default-constructed contig map.
conditioning_sequence = "XXXXXXXXXXXXXXXXPEPSEQXXXXXXXXXXXXXXXX"
sequence_config = config.InferenceConfig(
    sequence=conditioning_sequence,
    contigmap_params=config.ContigMap(),
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    sequence_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:59<00:00,  2.38s/it]


In [36]:
#show_structure(designed_pdb)

In [37]:
# Save the sequence-conditioned design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_sequence_conditioning.pdb", mode="w") as f:
    f.write(designed_pdb)

In [38]:
designed_sequence

'SLEELLARIEELLEELPEPSEQAKAQLEELLARIKELK'

## Sequence Partial Diffusion

```bash
python ./inference.py \
    --num_designs 10 \
    --sequence SAKVEELLETAKALGISEEEVREILELLEAGFIVIEVVSLGDAVILILENKKLGKYYILKNGEIERIKKPENARELKRKIAEILNISVEEIEAIIEKLRAK \
    --out examples/out/partial_diffusion_design \
    --sampling_temp 0.3 --T 50 --save_best_plddt
```

In [40]:
# Python counterpart of the sequence partial-diffusion CLI example above: the
# same starting sequence with `--sampling_temp 0.3 --T 50`.
starting_sequence = "SAKVEELLETAKALGISEEEVREILELLEAGFIVIEVVSLGDAVILILENKKLGKYYILKNGEIERIKKPENARELKRKIAEILNISVEEIEAIIEKLRAK"
sequence_partial_config = config.InferenceConfig(
    sequence=starting_sequence,
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(),
    sampling_temp=0.3,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    sequence_partial_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:39<00:00,  2.61s/it]


In [41]:
designed_sequence

'GIPPLIIIRIFRIPGITLDEIINFLKNLGFENIEIERLGENYFVIRFRINGREIIIVFDKNGKILDIIFSEEDLKEILEFLKKLGINPEELEKELEKIFPN'

In [42]:
#show_structure(designed_pdb)

In [43]:
# Save the sequence partial-diffusion design as a PDB file.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_sequence_partial_diffusion.pdb", mode="w") as f:
    f.write(designed_pdb)

In [44]:
designed_sequence

'GIPPLIIIRIFRIPGITLDEIINFLKNLGFENIEIERLGENYFVIRFRINGREIIIVFDKNGKILDIIFSEEDLKEILEFLKKLGINPEELEKELEKIFPN'

## Symmetric Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/symmetric_design \
    --contigs 25,0 25,0 25,0 \
    --T 50 \
    --save_best_plddt \
    --symmetry 3
```

In [46]:
# Python counterpart of the symmetric-design CLI example above
# (`--contigs 25,0 25,0 25,0 --T 50 --symmetry 3`), with `/` in the contig
# string where the CLI uses `,`.
symmetric_config = config.InferenceConfig(
    diffuser_params=config.DiffuserParams(T=50),
    contigmap_params=config.ContigMap(contigs=["25/0 25/0 25/0"]),
    symmetry_params=config.SymmetryParams(symmetry=3),
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    symmetric_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:03<00:00,  2.48s/it]


In [47]:
#show_structure(designed_pdb)

In [48]:
# Save the symmetric design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_symmetric.pdb", mode="w") as f:
    f.write(designed_pdb)

In [49]:
designed_sequence

'NKIIIELTPEEIEEFLKFIKRIIEENKIIIILTPEEIEEFLKFIKRIIEENKIIIELTPEEIEEFLKFIKRIIEE'

## Unconditional Design

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt
```

In [51]:
# Python counterpart of the unconditional CLI example above: the only input is
# a contig of 100; every other setting is left at the config defaults.
unconditional_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    unconditional_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:06<00:00,  2.65s/it]


In [52]:
#designed_structure.atom_positions[0]

In [53]:
#from proteome.constants import residue_constants

In [54]:
#show_structure(designed_pdb)

In [55]:
# Save the unconditional design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_unconditional.pdb", mode="w") as f:
    f.write(designed_pdb)

In [56]:
designed_sequence

'GPPPLSPEEIEELRELLEELAERFGISPEELARFFEPFIRIFLEKDPEELIEELRRFLESGFTREEFVEVSIPEIERYVEKGLLSDEEVEELLEFLERLG'

## Weighted Sequence

```bash
python ./inference.py \
    --num_designs 10 \
    --out examples/out/design \
    --contigs 100 \
    --T 25 --save_best_plddt \
    --potentials aa_bias \
    --aa_composition W0.2 --potential_scale 1.75 
```

In [58]:
# Python counterpart of the weighted-sequence CLI example above
# (`--potentials aa_bias --aa_composition W0.2 --potential_scale 1.75`).
aa_bias_potentials = config.PotentialsParams(
    potentials=[config.AACompositionalBiasParams(aa_composition="W0.2")],
    potential_scales=[1.75],
)
weighted_config = config.InferenceConfig(
    contigmap_params=config.ContigMap(contigs=["100"]),
    potentials_params=aa_bias_potentials,
)

designed_structure, designed_sequence = designer.design_structure_and_sequence(
    weighted_config
)

# Serialize the structure to PDB text so it can be saved in a later cell.
designed_pdb = designed_structure.to_pdb()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:04<00:00,  2.57s/it]


In [59]:
#show_structure(designed_pdb)

In [60]:
# Save the composition-biased design as a PDB file in the working directory.
# `designed_pdb` comes from `to_pdb()` and is expected to be one string, so it
# is written with a single `write` call; `writelines` would iterate over the
# string one character at a time.
with open("reference_weighted_sequence.pdb", mode="w") as f:
    f.write(designed_pdb)

In [61]:
designed_sequence

'GLPEITPEEIEELKKLWEEWKEALEPFLEWLERRGIPIGNPEFIKEFEEFIEELRKEIKNGATREEIIEFFIEEIEELVEKGLITEEEVEEFLKWIERWG'